feat(04): migrate facade to spiderJobs.platforms.* + asyncio bridge; delete jobs_spider/
Plan 01 - facade migration (ARCH-06/07):
- boss.py: import from spiderJobs.platforms.boss.{api,client,sign}
- qcwy.py: import from spiderJobs.platforms.job51.{api,client}
- zhilian.py: import from spiderJobs.platforms.zhilian.{api,client,sign}
- All 3 Service classes: +4 async_* methods via asyncio.to_thread()
Plan 02 - deprecation + cleanup (ARCH-08):
- 11 private copy files (_base, _http_client, _boss/job51/zhilian *): DEPRECATED header
- jobs_spider/ directory: fully deleted (user request)
Full regression: 106 passed in 0.61s
This commit is contained in:
parent
2e11edcef8
commit
3aadbd128b
116
app/services/crawler/_base.py
Normal file
116
app/services/crawler/_base.py
Normal file
@ -0,0 +1,116 @@
|
||||
# ⚠️ DEPRECATED — 2026-03-21
|
||||
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
|
||||
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
|
||||
# 将在下一里程碑中删除。
|
||||
#
|
||||
"""
|
||||
通用基类与数据结构
|
||||
复制自 spiderJobs/core/base.py — import 改为本地引用
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
from app.services.crawler._http_client import HTTPClient
|
||||
|
||||
|
||||
@dataclass
class ApiResult:
    """Normalized outcome of a single platform API call.

    Every parser in this package returns one of these, so callers can treat
    transport failures, business-level failures and successes uniformly.
    """

    # --- always supplied by the parser ---
    success: bool      # True only when the parser accepted the response
    status_code: int   # HTTP or business status; -1 is used for raised exceptions

    # --- optional payload ---
    data: Any = None   # parsed payload object, when there is one
    # The field name shadows the builtin ``list`` inside the class body, so the
    # annotation below only works because the module enables postponed
    # evaluation of annotations (``from __future__ import annotations``).
    list: list[dict] = field(default_factory=list)
    count: int = 0
    is_end_page: bool = True   # defaults to "no further pages"

    # --- failure details ---
    error: Optional[str] = None
|
||||
|
||||
|
||||
def parse_response(http_code: int, raw: Any) -> ApiResult:
    """Convert a raw ``(http_code, body)`` pair into an :class:`ApiResult`.

    The body is expected to be a dict carrying ``statusCode`` and ``data``.
    A call succeeds only when both the HTTP status and ``statusCode`` are 200
    (a non-dict body has no business code, so the HTTP status is used instead).

    Args:
        http_code: HTTP status of the response.
        raw: Decoded response body; normally a dict.

    Returns:
        A failed ``ApiResult`` with the best available error message, or a
        successful one. If the payload has a ``list`` key the paging fields
        (``list``, ``count``, ``is_end_page``) are populated from it.
    """
    is_mapping = isinstance(raw, dict)
    biz_code = raw.get("statusCode") if is_mapping else http_code

    if http_code != 200 or biz_code != 200:
        if is_mapping:
            message = (
                raw.get("statusDescription")
                or raw.get("message")
                or f"请求失败: {biz_code}"
            )
        else:
            message = f"请求失败: {http_code}"
        return ApiResult(
            success=False,
            status_code=biz_code or http_code,
            error=message,
        )

    payload = (raw.get("data") or {}) if is_mapping else {}

    if isinstance(payload, dict) and "list" in payload:
        return ApiResult(
            success=True, status_code=200, data=payload,
            # A JSON null here used to become ``list=None`` and later crash
            # ``load_all`` on ``extend(None)``; normalize it to an empty list.
            list=payload.get("list") or [],
            count=payload.get("count", 0),
            is_end_page=payload.get("isEndPage", True),
        )

    return ApiResult(success=True, status_code=200, data=payload)
|
||||
|
||||
|
||||
class BaseFetcher:
    """Base class for single-shot (non-paged) GET endpoints.

    Subclasses set ``ENDPOINT`` and implement ``_build_params``; they may also
    override ``_parse`` to plug in a platform-specific response parser.
    """

    ENDPOINT: str = ""

    def __init__(self, http_client: HTTPClient):
        self._http = http_client

    def _build_params(self) -> dict:
        """Return the query parameters for the request (subclass hook)."""
        raise NotImplementedError

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Turn the raw response into an ``ApiResult``."""
        return parse_response(http_code, raw)

    def fetch(self) -> ApiResult:
        """Issue the GET request and return the parsed result.

        Any exception raised while building parameters or performing the
        request is converted into a failed ``ApiResult`` with status ``-1``.
        Exceptions raised by ``_parse`` are not caught.
        """
        try:
            send = self._http.get
            status, body = send(self.ENDPOINT, self._build_params())
        except Exception as exc:
            return ApiResult(success=False, status_code=-1, error=str(exc))
        else:
            return self._parse(status, body)
|
||||
|
||||
|
||||
class BaseSearcher:
    """Base class for paged search endpoints.

    Subclasses set ``ENDPOINT`` and implement ``_build_params``. The default
    transport is POST; override ``_request`` to change it and ``_parse`` to
    use a platform-specific parser.
    """

    ENDPOINT: str = ""

    def __init__(self, page_size: int = 15, http_client: HTTPClient = None):
        self._http = http_client
        self.page_size = page_size

    def _build_params(self, page_index: int) -> dict:
        """Return the request parameters for ``page_index`` (subclass hook)."""
        raise NotImplementedError

    def _request(self, params: dict) -> tuple[int, Any]:
        """Send one request and return ``(http_code, body)``."""
        return self._http.post(self.ENDPOINT, params)

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Turn the raw response into an ``ApiResult``."""
        return parse_response(http_code, raw)

    def search(self, page_index: int = 1) -> ApiResult:
        """Fetch and parse a single page.

        Parameters are built outside the guarded section, so an error in
        ``_build_params`` propagates; an error during the request itself is
        reported as a failed ``ApiResult`` with status ``-1``.
        """
        query = self._build_params(page_index)
        try:
            status, body = self._request(query)
        except Exception as exc:
            return ApiResult(success=False, status_code=-1, error=str(exc))
        return self._parse(status, body)

    def load_all(
        self,
        max_pages: int = 10,
        on_page: Optional[Callable[[ApiResult, int], None]] = None,
    ) -> list[dict]:
        """Collect items from page 1 up to ``max_pages``.

        Stops early at the first failed page (keeping what was collected so
        far) or once a page reports ``is_end_page``. ``on_page`` is invoked
        after each successful page with the result and its 1-based index.
        """
        collected: list[dict] = []
        for current in range(1, max_pages + 1):
            page = self.search(page_index=current)
            if not page.success:
                return collected
            collected.extend(page.list)
            if on_page:
                on_page(page, current)
            if page.is_end_page:
                return collected
        return collected
|
||||
182
app/services/crawler/_boss_api.py
Normal file
182
app/services/crawler/_boss_api.py
Normal file
@ -0,0 +1,182 @@
|
||||
# ⚠️ DEPRECATED — 2026-03-21
|
||||
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
|
||||
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
|
||||
# 将在下一里程碑中删除。
|
||||
#
|
||||
"""
|
||||
Boss直聘 - 所有 API 接口
|
||||
复制自 spiderJobs/platforms/boss/api.py — import 改为本地引用
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Optional
|
||||
from urllib.parse import urlencode
|
||||
|
||||
from app.services.crawler._base import ApiResult, BaseFetcher, BaseSearcher
|
||||
from app.services.crawler._boss_client import BossClient, create_client
|
||||
|
||||
|
||||
def _parse_boss_response(http_code: int, raw: Any) -> ApiResult:
    """Parse a Boss Zhipin response envelope into an :class:`ApiResult`.

    The envelope is a dict with ``code`` (0 means success), ``message`` and
    ``zpData``. When ``zpData`` contains ``jobList`` (checked first) or
    ``list``, the paging fields are filled in and ``hasMore`` decides
    ``is_end_page``.

    Args:
        http_code: HTTP status of the response.
        raw: Decoded response body.

    Returns:
        A failed result for non-200 HTTP status, a non-dict body or a
        non-zero business code; otherwise a successful result.
    """
    if http_code != 200:
        return ApiResult(success=False, status_code=http_code, error=f"HTTP 请求失败: {http_code}")

    if not isinstance(raw, dict):
        return ApiResult(success=False, status_code=http_code, error="响应格式异常")

    biz_code = raw.get("code", -1)
    if biz_code != 0:
        return ApiResult(
            success=False, status_code=biz_code,
            error=raw.get("message") or f"业务错误: {biz_code}",
        )

    payload = raw.get("zpData") or {}

    if isinstance(payload, dict):
        # "jobList" takes precedence over "list" when both are present.
        for key in ("jobList", "list"):
            if key not in payload:
                continue
            # The key may be present with a JSON null; treat that as an empty
            # page instead of failing on len(None).
            items = payload.get(key) or []
            has_more = payload.get("hasMore", False)
            return ApiResult(
                success=True, status_code=200, data=payload,
                list=items, count=len(items), is_end_page=not has_more,
            )

    return ApiResult(success=True, status_code=200, data=payload)
|
||||
|
||||
|
||||
class SearchRecJobs(BaseSearcher):
    """Paged search over the mini-program recommended-job list (GET)."""

    ENDPOINT = "/wapi/zpgeek/miniapp/homepage/recjoblist.json"

    def __init__(
        self,
        *,
        city_code: str = "101280600",
        sort_type: int = 1,
        district_code: str = "",
        blue_welfare: str = "",
        encrypt_expect_id: str = "",
        page_size: int = 15,
        client: Optional[BossClient] = None,
    ):
        # A fresh default client is created only when none is supplied.
        super().__init__(page_size=page_size, http_client=client or create_client())
        self.city_code = city_code
        self.district_code = district_code
        self.sort_type = sort_type
        self.blue_welfare = blue_welfare
        self.encrypt_expect_id = encrypt_expect_id

    def _build_params(self, page_index: int) -> dict:
        """Build the query string parameters for one page."""
        params = {
            "cityCode": self.city_code,
            "sortType": self.sort_type,
            "page": page_index,
            "pageSize": self.page_size,
            "encryptExpectId": self.encrypt_expect_id,
            "districtCode": self.district_code,
            "blueWelfare": self.blue_welfare,
            "appId": 10002,
        }
        return params

    def _request(self, params: dict) -> tuple[int, Any]:
        """Send the page request as GET instead of the base-class POST."""
        return self._http.get(self.ENDPOINT, params)

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Parse using the Boss response envelope."""
        return _parse_boss_response(http_code, raw)
|
||||
|
||||
|
||||
class GetJobDetail(BaseFetcher):
    """Fetch a job's detail and its "improvement" data in one batch call."""

    ENDPOINT = "/wapi/batch/requests"

    def __init__(
        self,
        *,
        security_id: str,
        job_id: str,
        lid: str = "",
        source: int = 10,
        client: Optional[BossClient] = None,
    ):
        super().__init__(http_client=client or create_client())
        self.security_id = security_id
        self.job_id = job_id
        self.lid = lid
        self.source = source

    def _build_params(self) -> dict:
        """Unused: the request is assembled in ``fetch``."""
        return {}

    def fetch(self) -> ApiResult:
        """Send both sub-requests through the client's batch endpoint.

        Exceptions raised by the batch call are reported as a failed
        ``ApiResult`` with status ``-1``.
        """
        shared = {"securityId": self.security_id, "jobId": self.job_id, "lid": self.lid}
        # The detail query carries an extra trailing "source" field.
        detail_query = urlencode({**shared, "source": self.source})
        improvement_query = urlencode(shared)
        sub_reqs = [
            {
                "path": "/wapi/zpgeek/miniapp/job/detail.json",
                "method": "GET",
                "query": detail_query,
            },
            {
                "path": "/wapi/zpgeek/miniapp/jobdetail/improvement/query.json",
                "method": "GET",
                "query": improvement_query,
            },
        ]
        try:
            boss: BossClient = self._http
            status, body = boss.batch(sub_reqs)
        except Exception as exc:
            return ApiResult(success=False, status_code=-1, error=str(exc))
        return self._parse(status, body)

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Validate the batch envelope and merge both sub-responses.

        On success ``data`` is ``{"detail": ..., "improvement": ...}``, where
        each value is the sub-response's ``zpData`` when the sub-response is a
        dict, or the sub-response itself otherwise.
        """
        if http_code != 200:
            return ApiResult(success=False, status_code=http_code, error=f"HTTP 请求失败: {http_code}")
        if not isinstance(raw, dict):
            return ApiResult(success=False, status_code=http_code, error="响应格式异常")

        biz_code = raw.get("code", -1)
        if biz_code != 0:
            return ApiResult(success=False, status_code=biz_code, error=raw.get("message") or f"业务错误: {biz_code}")

        def unwrap(part: Any) -> Any:
            return part.get("zpData") if isinstance(part, dict) else part

        # The batch response is keyed by each sub-request's path.
        by_path = raw.get("zpData") or {}
        merged = {
            "detail": unwrap(by_path.get("/wapi/zpgeek/miniapp/job/detail.json", {})),
            "improvement": unwrap(
                by_path.get("/wapi/zpgeek/miniapp/jobdetail/improvement/query.json", {})
            ),
        }
        return ApiResult(success=True, status_code=200, data=merged)
|
||||
|
||||
|
||||
class GetBrandDetail(BaseFetcher):
    """Fetch the detail record of a single brand (company)."""

    ENDPOINT = "/wapi/zpgeek/miniapp/brand/detail.json"

    def __init__(self, *, brand_id: str, client: Optional[BossClient] = None):
        http = client or create_client()
        super().__init__(http_client=http)
        self.brand_id = brand_id

    def _build_params(self) -> dict:
        """Build the query parameters identifying the brand."""
        params = {"brandId": self.brand_id, "appId": 10002}
        return params

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Parse using the Boss response envelope."""
        return _parse_boss_response(http_code, raw)
|
||||
|
||||
|
||||
class SearchBrandJobs(BaseSearcher):
    """Paged search over the jobs published by one brand (GET)."""

    ENDPOINT = "/wapi/zpgeek/miniapp/brand/joblist.json"

    def __init__(
        self,
        *,
        brand_id: str,
        query: str = "",
        position_lv1: int = 0,
        city: str = "",
        experience: str = "",
        salary: str = "",
        page_size: int = 15,
        client: Optional[BossClient] = None,
    ):
        super().__init__(page_size=page_size, http_client=client or create_client())
        self.brand_id = brand_id
        self.query = query
        self.city = city
        self.position_lv1 = position_lv1
        self.experience = experience
        self.salary = salary

    def _build_params(self, page_index: int) -> dict:
        """Build the query string parameters for one page."""
        # NOTE(review): ``page_size`` is accepted by the constructor but is not
        # sent here — presumably the endpoint fixes its own page size; confirm.
        params = {
            "brandId": self.brand_id,
            "query": self.query,
            "page": page_index,
            "hasMore": "true",
            "positionLv1": self.position_lv1,
            "city": self.city,
            "experience": self.experience,
            "salary": self.salary,
            "appId": 10002,
        }
        return params

    def _request(self, params: dict) -> tuple[int, Any]:
        """Send the page request as GET instead of the base-class POST."""
        return self._http.get(self.ENDPOINT, params)

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Parse using the Boss response envelope."""
        return _parse_boss_response(http_code, raw)
|
||||
99
app/services/crawler/_boss_client.py
Normal file
99
app/services/crawler/_boss_client.py
Normal file
@ -0,0 +1,99 @@
|
||||
# ⚠️ DEPRECATED — 2026-03-21
|
||||
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
|
||||
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
|
||||
# 将在下一里程碑中删除。
|
||||
#
|
||||
"""
|
||||
Boss直聘 HTTP 客户端
|
||||
复制自 spiderJobs/platforms/boss/client.py — import 改为本地引用
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
from app.services.crawler._http_client import HTTPClient
|
||||
from app.services.crawler._boss_sign import BossSign
|
||||
|
||||
# Root URL that every request path is appended to.
BASE_URL = "https://www.zhipin.com"

# Default headers sent with every Boss request.
BOSS_HEADERS = {
    "content-type": "application/x-www-form-urlencoded",
    # User-agent string assembled from its space-separated product tokens.
    "user-agent": " ".join((
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "(KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
        "MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI",
        "MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.7(0x13080712)",
        "UnifiedPCMacWechat(0xf2641702) XWEB/18788",
    )),
    "x-requested-with": "XMLHttpRequest",
    "xweb_xhr": "1",
    # Application / version identification.
    "zp_app_id": "10002",
    "zp_product_id": "10002",
    "ver": "14.0400",
    "mini_ver": "14.0400",
    "platform": "zhipin/mac",
    "ua": '{"model":"Mac16,8","platform":"mac"}',
    "scene": "1256",
    # Fetch metadata and origin.
    "sec-fetch-site": "cross-site",
    "sec-fetch-mode": "cors",
    "sec-fetch-dest": "empty",
    "referer": "https://servicewechat.com/wxa8da525af05281f3/601/page-frame.html",
    "accept-language": "zh-CN,zh;q=0.9",
}
|
||||
|
||||
|
||||
class BossClient(HTTPClient):
    """HTTP client preconfigured for Boss Zhipin.

    Adds the signer-derived headers (``mpt``, ``wt2`` and a fresh ``Traceid``)
    to every request on top of ``BOSS_HEADERS``.
    """

    def __init__(
        self,
        signer: Optional[BossSign] = None,
        tunnel_proxy: Optional[str] = None,
        proxy: Optional[str] = None,
        proxy_pool: Optional[list[str]] = None,
        timeout: int = 10,
    ):
        super().__init__(
            base_url=BASE_URL,
            default_headers=BOSS_HEADERS,
            tunnel_proxy=tunnel_proxy,
            proxy=proxy,
            proxy_pool=proxy_pool,
            timeout=timeout,
        )
        # Fall back to a signer with empty credentials.
        self.signer = signer or BossSign()

    def _boss_headers(self) -> dict:
        """Return the per-request headers; ``Traceid`` is regenerated each call."""
        signer = self.signer
        return {
            "mpt": signer.mpt,
            "wt2": signer.wt2,
            "Traceid": BossSign.generate_traceid("M-W"),
        }

    def post(self, path: str, body: dict, headers: Optional[dict] = None) -> tuple[int, Any]:
        """POST with Boss headers; caller-supplied ``headers`` win on conflict."""
        combined = self._boss_headers()
        combined.update(headers or {})
        return super().post(path, body, combined)

    def get(self, path: str, params: Optional[dict] = None, headers: Optional[dict] = None) -> tuple[int, Any]:
        """GET with Boss headers; caller-supplied ``headers`` win on conflict."""
        combined = self._boss_headers()
        combined.update(headers or {})
        return super().get(path, params, combined)

    def batch(self, sub_reqs: list[dict]) -> tuple[int, Any]:
        """POST several sub-requests to the batch endpoint as a JSON body."""
        payload = {"subReqs": sub_reqs, "appId": 10002}
        json_header = {"content-type": "application/json"}
        return self.post("/wapi/batch/requests", payload, headers=json_header)
|
||||
|
||||
|
||||
def create_client(
    signer: Optional[BossSign] = None,
    tunnel_proxy: Optional[str] = None,
    proxy: Optional[str] = None,
    proxy_pool: Optional[list[str]] = None,
) -> BossClient:
    """Build a :class:`BossClient` with the given signer and proxy settings.

    The client's ``timeout`` is left at its default.
    """
    options = {
        "signer": signer,
        "tunnel_proxy": tunnel_proxy,
        "proxy": proxy,
        "proxy_pool": proxy_pool,
    }
    return BossClient(**options)
|
||||
78
app/services/crawler/_boss_sign.py
Normal file
78
app/services/crawler/_boss_sign.py
Normal file
@ -0,0 +1,78 @@
|
||||
# ⚠️ DEPRECATED — 2026-03-21
|
||||
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
|
||||
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
|
||||
# 将在下一里程碑中删除。
|
||||
#
|
||||
"""
|
||||
Boss直聘 Traceid 生成算法
|
||||
复制自 spiderJobs/platforms/boss/sign.py — import 改为本地引用
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
import time
|
||||
|
||||
|
||||
# 62-character alphabet (digits, lowercase, uppercase) used both for the
# random part of the trace uuid and for mapping checksum values to characters.
_CHARS = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
||||
|
||||
|
||||
def _to_u32(n: int) -> int:
|
||||
return n & 0xFFFFFFFF
|
||||
|
||||
|
||||
def _compute_checksum(uuid_str: str) -> str:
    """Derive the three-character checksum appended to a trace uuid.

    Three 32-bit rolling hashes are computed over the characters (a forward
    one, a reverse position-weighted one, and one weighted by distance from
    the middle). Each pair of hashes is XOR-ed, scrambled, and mapped onto one
    character of ``_CHARS``.
    """
    codes = [ord(ch) for ch in uuid_str]
    size = len(codes)
    middle = size // 2

    # Forward hash: h = h * 31 + code.
    forward = 0
    for code in codes:
        forward = _to_u32((forward << 5) - forward + code)

    # Reverse hash: h = h * 127 + code * (1-based position), last char first.
    backward = 0
    for pos in reversed(range(size)):
        backward = _to_u32((backward << 7) - backward + codes[pos] * (pos + 1))

    # Centre-weighted hash: h = h * 7 + code * (distance from middle + 1).
    centred = 0
    for pos, code in enumerate(codes):
        centred = _to_u32((centred << 3) - centred + code * (abs(pos - middle) + 1))

    def pick(seed: int, first_mul: int, second_mul: int) -> str:
        # multiply / xor-shift 16 / multiply / xor-shift 13, all in 32 bits
        mixed = _to_u32(seed)
        mixed = _to_u32(first_mul * mixed)
        mixed = _to_u32(mixed ^ (mixed >> 16))
        mixed = _to_u32(second_mul * mixed)
        mixed = _to_u32(mixed ^ (mixed >> 13))
        return _CHARS[mixed % 62]

    first = pick(forward ^ backward, 2654435761, 2246822507)
    second = pick(backward ^ centred, 3266489909, 2654435761)
    third = pick(centred ^ forward, 668265261, 2246822507)
    return f"{first}{second}{third}"
|
||||
|
||||
|
||||
def _generate_uuid() -> str:
    """Return a 19-character id: 13 hex timestamp chars + 6 random chars.

    The timestamp is the current time in milliseconds, rendered in lowercase
    hex, truncated to its last 13 characters and left-padded with zeros.
    """
    millis = int(time.time() * 1000)
    stamp = f"{millis:x}"[-13:].zfill(13)
    suffix = "".join([random.choice(_CHARS) for _ in range(6)])
    return stamp + suffix
|
||||
|
||||
|
||||
class BossSign:
    """Holds the ``mpt`` / ``wt2`` values and generates Traceid strings."""

    def __init__(self, *, mpt: str = "", wt2: str = ""):
        self.mpt, self.wt2 = mpt, wt2

    @staticmethod
    def generate_traceid(prefix: str = "M-W") -> str:
        """Return ``prefix`` + a fresh uuid + that uuid's 3-char checksum."""
        fresh = _generate_uuid()
        return prefix + fresh + _compute_checksum(fresh)
|
||||
128
app/services/crawler/_http_client.py
Normal file
128
app/services/crawler/_http_client.py
Normal file
@ -0,0 +1,128 @@
|
||||
# ⚠️ DEPRECATED — 2026-03-21
|
||||
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
|
||||
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
|
||||
# 将在下一里程碑中删除。
|
||||
#
|
||||
"""
|
||||
通用 HTTP 客户端
|
||||
基于 requests-go,自带 Chrome TLS 指纹伪装
|
||||
支持代理 IP / 隧道代理 / 代理池轮换
|
||||
与任何招聘平台无关,纯粹负责发请求
|
||||
|
||||
复制自 spiderJobs/core/http_client.py — 不要直接 import spiderJobs,避免跨模块依赖
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
from typing import Any, Optional
|
||||
|
||||
import requests_go as requests
|
||||
from requests_go.tls_config import TLS_CHROME_LATEST
|
||||
|
||||
|
||||
class HTTPClient:
    """Platform-agnostic HTTP client built on requests-go.

    Proxy priority: ``tunnel_proxy`` > ``proxy_pool`` > ``proxy``.

    * ``tunnel_proxy``: every request uses a brand-new session that is closed
      afterwards.
    * ``proxy_pool``: a random entry is picked per request on the shared session.
    * ``proxy``: bound once to the shared session, only when neither of the
      other two is configured.
    """

    def __init__(
        self,
        base_url: str,
        default_headers: Optional[dict] = None,
        proxy: Optional[str] = None,
        tunnel_proxy: Optional[str] = None,
        proxy_pool: Optional[list[str]] = None,
        timeout: int = 10,
    ):
        self.base_url = base_url
        self.default_headers = default_headers or {}
        self.timeout = timeout

        self._proxy = proxy
        self._tunnel_proxy = tunnel_proxy
        self._proxy_pool = proxy_pool

        self._session = requests.Session()
        self._session.tls_config = TLS_CHROME_LATEST
        # NOTE: this mutates the shared module-level TLS config object.
        TLS_CHROME_LATEST.random_ja3 = True

        if proxy and not proxy_pool and not tunnel_proxy:
            self._session.proxies = {"http": proxy, "https": proxy}

    def _new_session(self) -> "requests.Session":
        """Create a one-off session carrying the same TLS configuration."""
        s = requests.Session()
        s.tls_config = TLS_CHROME_LATEST
        TLS_CHROME_LATEST.random_ja3 = True
        return s

    def _get_proxies(self) -> Optional[dict]:
        """Pick a random proxy from the pool, or return None without a pool."""
        if self._proxy_pool:
            chosen = random.choice(self._proxy_pool)
            # A random "#fragment" makes each proxy URL unique — presumably so
            # that pooled connections are not reused between requests; confirm.
            unique = f"{chosen}#{random.randint(100000, 999999)}"
            return {"http": unique, "https": unique}
        return None

    def _merge_headers(self, extra: Optional[dict] = None) -> dict:
        """Return a copy of the default headers updated with ``extra``."""
        headers = {**self.default_headers}
        if extra:
            headers.update(extra)
        return headers

    def _dispatch(
        self,
        method: str,
        path: str,
        headers: Optional[dict],
        **payload: Any,
    ) -> tuple[int, Any]:
        """Send one request and return ``(status_code, decoded_json)``.

        Shared implementation behind ``post`` and ``get``.

        Args:
            method: Session method name, ``"post"`` or ``"get"``.
            path: Path appended to ``base_url``.
            headers: Per-request headers merged over the defaults.
            **payload: Body/query keyword passed through to the session
                method (``json=...`` or ``params=...``).

        Raises:
            Whatever the session call or ``resp.json()`` raises; nothing is
            caught here.
        """
        url = f"{self.base_url}{path}"
        kwargs: dict[str, Any] = {
            **payload,
            "headers": self._merge_headers(headers),
            "timeout": self.timeout,
        }

        if self._tunnel_proxy:
            kwargs["proxies"] = {"http": self._tunnel_proxy, "https": self._tunnel_proxy}
            session = self._new_session()
            try:
                resp = getattr(session, method)(url, **kwargs)
                return resp.status_code, resp.json()
            finally:
                # The one-off session is always released, even on failure.
                session.close()

        proxies = self._get_proxies()
        if proxies:
            kwargs["proxies"] = proxies
        resp = getattr(self._session, method)(url, **kwargs)
        return resp.status_code, resp.json()

    def post(self, path: str, body: dict, headers: Optional[dict] = None) -> tuple[int, Any]:
        """POST ``body`` as JSON to ``base_url + path``."""
        return self._dispatch("post", path, headers, json=body)

    def get(self, path: str, params: Optional[dict] = None, headers: Optional[dict] = None) -> tuple[int, Any]:
        """GET ``base_url + path`` with ``params`` as the query string."""
        return self._dispatch("get", path, headers, params=params)

    def close(self) -> None:
        """Close the shared session held by this client."""
        self._session.close()
|
||||
170
app/services/crawler/_job51_api.py
Normal file
170
app/services/crawler/_job51_api.py
Normal file
@ -0,0 +1,170 @@
|
||||
# ⚠️ DEPRECATED — 2026-03-21
|
||||
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
|
||||
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
|
||||
# 将在下一里程碑中删除。
|
||||
#
|
||||
"""
|
||||
前程无忧 (51Job) - 所有 API 接口
|
||||
复制自 spiderJobs/platforms/job51/api.py — import 改为本地引用
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
from app.services.crawler._base import ApiResult, BaseFetcher, BaseSearcher
|
||||
from app.services.crawler._job51_client import Job51Client, create_client
|
||||
|
||||
|
||||
def _parse_job51_response(http_code: int, raw: Any) -> ApiResult:
    """Parse a 51Job response envelope into an :class:`ApiResult`.

    The envelope is a dict whose optional ``status`` must stringify to ``"1"``
    for success. The payload is taken from ``resultbody`` or, failing that,
    ``data``. Item lists are looked up in this order: ``jobList.items``,
    ``jobList`` (when it is itself a list), ``items`` (with ``totalCount`` as
    the count), ``list``. A page with no items is treated as the last page.

    Args:
        http_code: HTTP status of the response.
        raw: Decoded response body.

    Returns:
        A failed result for non-200 HTTP status, a non-dict body or a
        non-"1" business status; otherwise a successful result.
    """
    if http_code != 200:
        return ApiResult(success=False, status_code=http_code, error=f"HTTP 请求失败: {http_code}")

    if not isinstance(raw, dict):
        return ApiResult(success=False, status_code=http_code, error="响应格式异常")

    biz_status = raw.get("status")
    if biz_status is not None and str(biz_status) != "1":
        return ApiResult(
            success=False,
            status_code=int(biz_status) if str(biz_status).isdigit() else -1,
            error=raw.get("message") or f"业务错误: {biz_status}",
        )

    payload = raw.get("resultbody") or raw.get("data") or {}

    def _page(items: Any, total: Any = None) -> ApiResult:
        # A key that is present but null used to crash on len(None); treat it
        # as an empty page. A null/missing total falls back to the item count.
        rows = items or []
        return ApiResult(
            success=True, status_code=200, data=payload,
            list=rows,
            count=len(rows) if total is None else total,
            is_end_page=len(rows) == 0,
        )

    if isinstance(payload, dict):
        if "jobList" in payload:
            wrapper = payload["jobList"]
            if isinstance(wrapper, dict) and "items" in wrapper:
                return _page(wrapper["items"])
            if isinstance(wrapper, list):
                return _page(wrapper)
            # Any other jobList shape falls through to the checks below.
        if "items" in payload:
            return _page(payload["items"], payload.get("totalCount"))
        if "list" in payload:
            return _page(payload["list"])

    return ApiResult(success=True, status_code=200, data=payload)
|
||||
|
||||
|
||||
class SearchRecommendJobs(BaseSearcher):
    """Paged search over the 51Job mini-program recommendation tab."""

    ENDPOINT = "open/noauth/recommend/job-tab-dynamic-wx-mini"

    def __init__(
        self,
        *,
        job_area: str = "020000",
        function_type: str = "",
        job_type: str = "recommend",
        page_size: int = 10,
        client: Optional[Job51Client] = None,
    ):
        super().__init__(page_size=page_size, http_client=client or create_client())
        self.job_area = job_area
        self.job_type = job_type
        self.function_type = function_type

    def _build_params(self, page_index: int) -> dict:
        """Build the request body for one page.

        ``functionType`` is included only when a function type was given.
        """
        optional = {"functionType": self.function_type} if self.function_type else {}
        return {
            "pageNo": page_index,
            "pageSize": self.page_size,
            "specialPageCode": True,
            "isTouristMode": True,
            "type": self.job_type,
            "jobArea": self.job_area,
            "personAsLabel": "1",
            **optional,
        }

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Parse using the 51Job response envelope."""
        return _parse_job51_response(http_code, raw)
|
||||
|
||||
|
||||
class GetJobDetail(BaseFetcher):
    """Fetch the base detail of one job; the job id is part of the URL path."""

    ENDPOINT = "open/noauth/jobs/detail/base"

    def __init__(self, *, job_id: str, client: Optional[Job51Client] = None):
        http = client or create_client()
        super().__init__(http_client=http)
        self.job_id = job_id

    def _build_params(self) -> dict:
        """Unused: the request carries no query parameters."""
        return {}

    def fetch(self) -> ApiResult:
        """GET ``ENDPOINT/<job_id>`` and parse the response.

        Exceptions raised by the request are reported as a failed
        ``ApiResult`` with status ``-1``.
        """
        target = f"{self.ENDPOINT}/{self.job_id}"
        try:
            status, body = self._http.get(target)
        except Exception as exc:
            return ApiResult(success=False, status_code=-1, error=str(exc))
        return self._parse(status, body)

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Parse using the 51Job response envelope."""
        return _parse_job51_response(http_code, raw)
|
||||
|
||||
|
||||
class GetCompanyInfo(BaseFetcher):
    """Fetch the info record of one company."""

    ENDPOINT = "open/noauth/company-info/info-data"

    def __init__(
        self,
        *,
        company_id: str,
        color_one: str = "#ffffff",
        color_two: str = "#ffffffcc",
        client: Optional[Job51Client] = None,
    ):
        super().__init__(http_client=client or create_client())
        self.company_id = company_id
        self.color_one, self.color_two = color_one, color_two

    def _build_params(self) -> dict:
        """Build the query parameters identifying the company."""
        params = {
            "companyId": self.company_id,
            "colorOne": self.color_one,
            "colorTwo": self.color_two,
        }
        return params

    def fetch(self) -> ApiResult:
        """GET the endpoint with the built parameters and parse the response.

        The base-class implementation performs exactly these steps (guarded
        GET on ``ENDPOINT`` with ``_build_params()``, then ``_parse``), so
        this simply delegates to it.
        """
        return super().fetch()

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Parse using the 51Job response envelope."""
        return _parse_job51_response(http_code, raw)
|
||||
|
||||
|
||||
class SearchCompanyJobs(BaseSearcher):
    """Paged search over the jobs published by one company."""

    ENDPOINT = "open/noauth/jobs/company"

    def __init__(
        self,
        *,
        company_id: str,
        job_area: str = "",
        function: str = "",
        salary_type: str = "",
        page_size: int = 10,
        client: Optional[Job51Client] = None,
    ):
        super().__init__(page_size=page_size, http_client=client or create_client())
        self.company_id = company_id
        self.job_area = job_area
        self.salary_type = salary_type
        self.function = function

    def _build_params(self, page_index: int) -> dict:
        """Build the request body for one page."""
        params = {
            "pageNum": page_index,
            "pageSize": self.page_size,
            "coId": self.company_id,
            "jobArea": self.job_area,
            "function": self.function,
            "salaryType": self.salary_type,
            "scene": 14,
            "requestId": "",
        }
        return params

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Parse using the 51Job response envelope."""
        return _parse_job51_response(http_code, raw)
|
||||
137
app/services/crawler/_job51_client.py
Normal file
137
app/services/crawler/_job51_client.py
Normal file
@ -0,0 +1,137 @@
|
||||
# ⚠️ DEPRECATED — 2026-03-21
|
||||
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
|
||||
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
|
||||
# 将在下一里程碑中删除。
|
||||
#
|
||||
"""
|
||||
前程无忧 (51Job) HTTP 客户端
|
||||
复制自 spiderJobs/platforms/job51/client.py — import 改为本地引用
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any, Optional
|
||||
from urllib.parse import quote
|
||||
|
||||
from app.services.crawler._http_client import HTTPClient
|
||||
from app.services.crawler._job51_sign import Job51Sign
|
||||
|
||||
# Root URL that every (signed) request path is appended to.
BASE_URL = "https://cupid.51job.com"

# Default headers sent with every 51Job request.
JOB51_HEADERS = {
    # User-agent string assembled from its space-separated product tokens.
    "user-agent": " ".join((
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
        "(KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
        "MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI",
        "MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.7(0x13080712)",
        "UnifiedPCMacWechat(0xf2641702) XWEB/18788",
    )),
    "xweb_xhr": "1",
    "from-domain": "51job_weixin_wxapp",
    # Fetch metadata and origin.
    "sec-fetch-site": "cross-site",
    "sec-fetch-mode": "cors",
    "sec-fetch-dest": "empty",
    "referer": "https://servicewechat.com/wx1131e5c71e668b5d/426/page-frame.html",
    "accept-language": "zh-CN,zh;q=0.9",
    "priority": "u=1, i",
}
|
||||
|
||||
|
||||
class Job51Client(HTTPClient):
|
||||
def __init__(
|
||||
self,
|
||||
signer: Optional[Job51Sign] = None,
|
||||
tunnel_proxy: Optional[str] = None,
|
||||
proxy: Optional[str] = None,
|
||||
proxy_pool: Optional[list[str]] = None,
|
||||
timeout: int = 10,
|
||||
):
|
||||
super().__init__(
|
||||
base_url=BASE_URL,
|
||||
default_headers=JOB51_HEADERS,
|
||||
tunnel_proxy=tunnel_proxy,
|
||||
proxy=proxy,
|
||||
proxy_pool=proxy_pool,
|
||||
timeout=timeout,
|
||||
)
|
||||
self.signer = signer or Job51Sign()
|
||||
self._uuid = Job51Sign.generate_uuid()
|
||||
|
||||
def _job51_headers(self, sign: str) -> dict:
|
||||
property_obj = {
|
||||
"frompageUrl": "",
|
||||
"pageUrl": "pages/index/index",
|
||||
"isLogin": "否",
|
||||
"accountid": "",
|
||||
"resumeId": "",
|
||||
"firstFrompageUrl": "",
|
||||
"distinct_id": self._uuid,
|
||||
}
|
||||
return {
|
||||
"sign": sign,
|
||||
"partner": "",
|
||||
"property": quote(json.dumps(property_obj, ensure_ascii=False, separators=(",", ":")), safe=""),
|
||||
"uuid": self._uuid,
|
||||
"user-token": "",
|
||||
"account-id": "",
|
||||
}
|
||||
|
||||
def post(self, path: str, body: dict, headers: Optional[dict] = None) -> tuple[int, Any]:
|
||||
url_path, sign = self.signer.build_sign_path(path, "POST", body=body)
|
||||
|
||||
job51_h = self._job51_headers(sign)
|
||||
job51_h["Content-Type"] = "application/json"
|
||||
if headers:
|
||||
job51_h.update(headers)
|
||||
|
||||
raw_body = json.dumps(body, ensure_ascii=False, separators=(",", ":"))
|
||||
return self._post_raw(url_path, raw_body, job51_h)
|
||||
|
||||
def _post_raw(self, path: str, raw_body: str, headers: dict) -> tuple[int, Any]:
|
||||
merged_headers = self._merge_headers(headers)
|
||||
url = f"{self.base_url}{path}"
|
||||
|
||||
if self._tunnel_proxy:
|
||||
s = self._new_session()
|
||||
try:
|
||||
resp = s.post(
|
||||
url,
|
||||
data=raw_body.encode("utf-8"),
|
||||
headers=merged_headers,
|
||||
proxies={"http": self._tunnel_proxy, "https": self._tunnel_proxy},
|
||||
timeout=self.timeout,
|
||||
)
|
||||
return resp.status_code, resp.json()
|
||||
finally:
|
||||
s.close()
|
||||
|
||||
proxies = self._get_proxies()
|
||||
kwargs: dict[str, Any] = {
|
||||
"data": raw_body.encode("utf-8"),
|
||||
"headers": merged_headers,
|
||||
"timeout": self.timeout,
|
||||
}
|
||||
if proxies:
|
||||
kwargs["proxies"] = proxies
|
||||
resp = self._session.post(url, **kwargs)
|
||||
return resp.status_code, resp.json()
|
||||
|
||||
def get(self, path: str, params: Optional[dict] = None, headers: Optional[dict] = None) -> tuple[int, Any]:
    """Sign and send a GET request.

    Args:
        path: Endpoint path handed to the signer.
        params: Optional query parameters passed to the signer.
        headers: Optional extra headers; they override the generated ones.

    Returns:
        The result of the parent class's ``get``.
    """
    signed_path, signature = self.signer.build_sign_path(path, "GET", params=params)

    request_headers = {
        **self._job51_headers(signature),
        "content-type": "application/x-www-form-urlencoded",
    }
    if headers:
        request_headers.update(headers)

    # params is passed as None: the signed path returned by the signer is
    # used as the full request path.
    return super().get(signed_path, params=None, headers=request_headers)
|
||||
|
||||
|
||||
def create_client(
    signer: Optional[Job51Sign] = None,
    tunnel_proxy: Optional[str] = None,
    proxy: Optional[str] = None,
    proxy_pool: Optional[list[str]] = None,
) -> Job51Client:
    """Build a ``Job51Client``, forwarding every argument unchanged.

    Args:
        signer: Optional signer instance.
        tunnel_proxy: Optional tunnel proxy URL.
        proxy: Optional single proxy URL.
        proxy_pool: Optional list of proxy URLs.

    Returns:
        The newly constructed client.
    """
    options = {
        "signer": signer,
        "tunnel_proxy": tunnel_proxy,
        "proxy": proxy,
        "proxy_pool": proxy_pool,
    }
    return Job51Client(**options)
|
||||
62
app/services/crawler/_job51_sign.py
Normal file
62
app/services/crawler/_job51_sign.py
Normal file
@ -0,0 +1,62 @@
|
||||
# ⚠️ DEPRECATED — 2026-03-21
|
||||
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
|
||||
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
|
||||
# 将在下一里程碑中删除。
|
||||
#
|
||||
"""
|
||||
前程无忧 (51Job) 签名算法
|
||||
复制自 spiderJobs/platforms/job51/sign.py — import 改为本地引用
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import hmac
|
||||
import hashlib
|
||||
import time
|
||||
import random
|
||||
from urllib.parse import quote
|
||||
|
||||
|
||||
SIGN_KEY = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b"


class Job51Sign:
    """HMAC-SHA256 request signer for the 51Job API."""

    def __init__(self, *, sign_key: str = SIGN_KEY):
        """Store the signing key.

        Args:
            sign_key: Secret used as the HMAC key; defaults to ``SIGN_KEY``.
        """
        self.sign_key = sign_key

    @staticmethod
    def generate_uuid() -> str:
        """Return a numeric id: millisecond timestamp plus 10 random digits."""
        ts = str(int(time.time() * 1000))
        rand = str(random.randint(1000000000, 9999999999))
        return ts + rand

    def build_sign_path(
        self,
        endpoint: str,
        method: str = "GET",
        params: dict | None = None,
        body: dict | None = None,
    ) -> tuple[str, str]:
        """Build the signed request path and its signature.

        Args:
            endpoint: Endpoint path without the leading slash.
            method: HTTP method; only ``GET`` and ``POST`` (any case) affect
                how the signed message is built.
            params: Query parameters, appended percent-encoded for GET.
            body: JSON payload, appended to the signed message for POST.

        Returns:
            ``(path, signature)`` where ``path`` carries ``api_key``,
            ``timestamp`` and any GET parameters, and ``signature`` is the
            hex HMAC-SHA256 of the path (plus the compact JSON body for POST).
        """
        import json

        verb = method.upper()
        ts = int(time.time())
        # The query separator must be a literal "&timestamp="; the text had
        # been corrupted to "×tamp" by HTML-entity decoding of "&times".
        path = f"/{endpoint}?api_key=51job&timestamp={ts}"

        if verb == "GET" and params:
            query = "&".join(
                f"{quote(str(k), safe='')}={quote(str(v), safe='')}"
                for k, v in params.items()
            )
            path += "&" + query

        message = path
        if verb == "POST" and body is not None:
            # Same compact serialization the client uses for the request body.
            message += json.dumps(body, ensure_ascii=False, separators=(",", ":"))

        sign_hex = hmac.new(
            self.sign_key.encode("utf-8"),
            message.encode("utf-8"),
            hashlib.sha256,
        ).hexdigest()

        return path, sign_hex
|
||||
148
app/services/crawler/_zhilian_api.py
Normal file
148
app/services/crawler/_zhilian_api.py
Normal file
@ -0,0 +1,148 @@
|
||||
# ⚠️ DEPRECATED — 2026-03-21
|
||||
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
|
||||
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
|
||||
# 将在下一里程碑中删除。
|
||||
#
|
||||
"""
|
||||
智联招聘 - 所有 API 接口
|
||||
复制自 spiderJobs/platforms/zhilian/api.py — import 改为本地引用
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
from app.services.crawler._base import BaseFetcher, BaseSearcher
|
||||
from app.services.crawler._zhilian_client import ZhilianClient, create_cgate_client, create_capi_client
|
||||
|
||||
|
||||
# Fields sent with every position-search request.
_SEARCH_BODY = {
    "eventScenario": "wxmpZhaopinSearchV2",
    "filterMinSalary": 1,
    "S_SOU_EXPAND": "SOU_COMPANY_ID",
    "sortType": "DEFAULT",
    "resumeNumber": "",
    "version": "8.11.22",
    "identity": 0,
    "anonymous": 1,
}

# Filter names that may be copied from the caller's ``filters`` into the body.
_FILTER_KEYS = [
    "S_SOU_SALARY",
    "S_SOU_EDUCATION_LOWESTLEVEL",
    "S_SOU_REFRESH_DATE",
    "S_SOU_WORK_EXPERIENCE",
    "S_SOU_POSITION_TYPE",
    "S_SOU_COMPANY_TYPE",
    "S_SOU_COMPANY_SCALE",
    "welfareLabels",
    "S_SOU_JD_INDUSTRY_LEVEL",
]


class SearchPositions(BaseSearcher):
    """Paged position search built from a keyword, city, purpose and filters."""

    ENDPOINT = "/positionbusiness/searchrecommend/searchPositions"

    def __init__(
        self,
        *,
        keyword: str = "",
        city_code: int | str = "",
        collected_purpose: Optional[dict] = None,
        filters: Optional[dict] = None,
        page_size: int = 15,
        client: Optional[ZhilianClient] = None,
    ):
        """Store the search criteria.

        Args:
            keyword: Free-text query.
            city_code: City code for the search.
            collected_purpose: Optional job-intention dict; see
                ``_purpose_params`` for the keys that are read.
            filters: Optional mapping; only keys in ``_FILTER_KEYS`` are used.
            page_size: Results per page.
            client: HTTP client; a cgate client is created when omitted.
        """
        super().__init__(page_size=page_size, http_client=client or create_cgate_client())
        self.keyword = keyword
        self.city_code = city_code
        self.collected_purpose = collected_purpose
        self.filters = filters or {}

    def _build_params(self, page_index: int) -> dict:
        """Build the request body for one page.

        Purpose-derived values are applied first; ``keyword`` and
        ``city_code`` only fill in what the purpose left unset, and truthy
        filter values are applied last.
        """
        payload = dict(_SEARCH_BODY)
        payload["pageIndex"] = page_index
        payload["pageSize"] = self.page_size

        purpose = self.collected_purpose
        if purpose:
            payload.update(self._purpose_params(purpose, page_index))

        # The keyword is only applied when no job-level code is present.
        if self.keyword and "S_SOU_JD_JOB_LEVEL3" not in payload:
            payload["S_SOU_FULL_INDEX"] = self.keyword
        if self.city_code and "S_SOU_WORK_CITY" not in payload:
            payload["S_SOU_WORK_CITY"] = self.city_code

        for name in _FILTER_KEYS:
            if self.filters.get(name):
                payload[name] = self.filters[name]
        return payload

    @staticmethod
    def _purpose_params(purpose: dict, page_index: int) -> dict:
        """Translate a job-intention dict into search parameters.

        Reads ``pnew_preferred_job_type`` (falling back to
        ``job_type_name``), ``city_id`` (falling back to
        ``preferred_location``) and the ``preferred_salary_min`` /
        ``preferred_salary_max`` pair.
        """
        result: dict = {"pageIndex": page_index}

        job_code = purpose.get("pnew_preferred_job_type", "")
        job_name = purpose.get("job_type_name", "")
        if job_code:
            result["S_SOU_JD_JOB_LEVEL3"] = job_code
        elif job_name:
            result["S_SOU_FULL_INDEX"] = job_name

        location = purpose.get("city_id", "") or purpose.get("preferred_location", "")
        if location:
            result["S_SOU_WORK_CITY"] = location

        low = purpose.get("preferred_salary_min", "")
        high = purpose.get("preferred_salary_max", "")
        # Skipped only when the minimum is "" or "-1" and the maximum is "".
        if not (low in ("", "-1") and high == ""):
            result["S_SOU_SALARY"] = f"{low},{high}"
        return result
|
||||
|
||||
|
||||
class GetPositionDetail(BaseFetcher):
    """Fetch the detail module of a single position."""

    ENDPOINT = "/positionbusiness/position/getPositionModule"

    def __init__(self, *, number: str, identity: int = 0, client: Optional[ZhilianClient] = None):
        """Store the position number and identity value.

        Args:
            number: Position number to look up.
            identity: Value sent as ``identity`` in the request.
            client: HTTP client; a cgate client is created when omitted.
        """
        http = client or create_cgate_client()
        super().__init__(http_client=http)
        self.number = number
        self.identity = identity

    def _build_params(self) -> dict:
        """Return the request parameters; ``resumeNumber`` is always empty."""
        return dict(number=self.number, identity=self.identity, resumeNumber="")
|
||||
|
||||
|
||||
class GetCompanyExtDetail(BaseFetcher):
    """Fetch extended company details by company name and number."""

    ENDPOINT = "/riskstorm/company/getCompanyExtDetail"

    def __init__(self, *, company_name: str, company_number: str, client: Optional[ZhilianClient] = None):
        """Store the company identifiers.

        Args:
            company_name: Company name sent as ``companyName``.
            company_number: Company number sent as ``companyNumber``.
            client: HTTP client; a cgate client is created when omitted.
        """
        http = client or create_cgate_client()
        super().__init__(http_client=http)
        self.company_name = company_name
        self.company_number = company_number

    def _build_params(self) -> dict:
        """Return the request parameters."""
        return dict(companyName=self.company_name, companyNumber=self.company_number)
|
||||
|
||||
|
||||
class GetCompanyDetail(BaseFetcher):
    """Fetch company details by company number."""

    ENDPOINT = "/positionbusiness/exposure/companyDetail"

    def __init__(self, *, number: str, client: Optional[ZhilianClient] = None):
        """Store the company number.

        Args:
            number: Company number sent as ``number``.
            client: HTTP client; a cgate client is created when omitted.
        """
        http = client or create_cgate_client()
        super().__init__(http_client=http)
        self.number = number

    def _build_params(self) -> dict:
        """Return the request parameters."""
        return dict(number=self.number)
|
||||
|
||||
|
||||
class SearchCompanyPositions(BaseSearcher):
    """Paged search of the positions published by one company."""

    ENDPOINT = "/capi/searchrecommend/searchPositionsCompany"

    def __init__(
        self,
        *,
        company_id: str,
        job_level: str = "",
        city_code: str = "",
        page_size: int = 30,
        client: Optional[ZhilianClient] = None,
    ):
        """Store the search criteria.

        Args:
            company_id: Company id sent as ``S_SOU_COMPANY_ID``.
            job_level: Optional job-level filter.
            city_code: Optional city filter.
            page_size: Results per page.
            client: HTTP client; a capi client is created when omitted.
        """
        # Kept on the instance because _build_params reads its signer.
        self._client = client or create_capi_client()
        super().__init__(page_size=page_size, http_client=self._client)
        self.company_id = company_id
        self.job_level = job_level
        self.city_code = city_code

    def _build_params(self, page_index: int) -> dict:
        """Build the query parameters for one page.

        Starts from the signer's ``sign_params()`` and adds the fixed search
        fields; the optional filters are included only when set.
        """
        query = dict(self._client.signer.sign_params())
        query["S_SOU_COMPANY_ID"] = self.company_id
        query["S_SOU_POSITION_SOURCE_TYPE"] = "1"
        query["eventScenario"] = "wxmpZhaopinSearchPositionsCompany"
        query["pageCode"] = "wxmpZhaopinCompanyDetailPage"
        query["pageIndex"] = page_index
        query["pageSize"] = self.page_size

        optional = (
            ("S_SOU_JD_JOB_LEVEL", self.job_level),
            ("S_SOU_WORK_CITY", self.city_code),
        )
        for name, value in optional:
            if value:
                query[name] = value
        return query

    def _request(self, params: dict) -> tuple[int, Any]:
        """Send the search as a GET request to ``ENDPOINT``."""
        return self._http.get(self.ENDPOINT, params)
|
||||
84
app/services/crawler/_zhilian_client.py
Normal file
84
app/services/crawler/_zhilian_client.py
Normal file
@ -0,0 +1,84 @@
|
||||
# ⚠️ DEPRECATED — 2026-03-21
|
||||
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
|
||||
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
|
||||
# 将在下一里程碑中删除。
|
||||
#
|
||||
"""
|
||||
智联招聘 HTTP 客户端
|
||||
复制自 spiderJobs/platforms/zhilian/client.py — import 改为本地引用
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
from app.services.crawler._http_client import HTTPClient
|
||||
from app.services.crawler._zhilian_sign import ZhilianSign
|
||||
|
||||
CGATE_BASE_URL = "https://cgate.zhaopin.com"
CAPI_BASE_URL = "https://capi.zhaopin.com"

# Default headers passed to the base HTTP client for every Zhilian request.
ZHILIAN_HEADERS = {
    "content-type": "application/json",
    "user-agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 "
        "MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI "
        "MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.7(0x13080712) "
        "UnifiedPCMacWechat(0xf2641702) XWEB/18788"
    ),
    "accept": "*/*",
    "sec-fetch-site": "cross-site",
    "sec-fetch-mode": "cors",
    "sec-fetch-dest": "empty",
    "referer": "https://servicewechat.com/wxb7718fb9257e4fd2/647/page-frame.html",
    "accept-language": "zh-CN,zh;q=0.9",
    "accept-encoding": "identity",
}


class ZhilianClient(HTTPClient):
    """HTTP client that adds Zhilian signing headers to every request."""

    def __init__(
        self,
        base_url: str = CGATE_BASE_URL,
        signer: Optional[ZhilianSign] = None,
        proxy: Optional[str] = None,
        proxy_pool: Optional[list[str]] = None,
        timeout: int = 10,
    ):
        """Configure the base client and the signer.

        Args:
            base_url: API host; defaults to the cgate host.
            signer: Signer to use; a default ``ZhilianSign`` when omitted.
            proxy: Optional single proxy URL.
            proxy_pool: Optional list of proxy URLs.
            timeout: Timeout forwarded to the base client.
        """
        super().__init__(
            base_url=base_url,
            default_headers=ZHILIAN_HEADERS,
            proxy=proxy,
            proxy_pool=proxy_pool,
            timeout=timeout,
        )
        self.signer = signer or ZhilianSign()

    def _signed_headers(self, page_code: str, headers: Optional[dict]) -> dict:
        """Return fresh signing headers overlaid with the caller's headers."""
        combined = self.signer.sign_headers(page_code)
        if headers:
            combined.update(headers)
        return combined

    def post(self, path: str, body: dict, headers: Optional[dict] = None, page_code: str = "0") -> tuple[int, Any]:
        """POST ``body`` to ``path`` with signing headers attached."""
        return super().post(path, body, self._signed_headers(page_code, headers))

    def get(self, path: str, params: Optional[dict] = None, headers: Optional[dict] = None, page_code: str = "0") -> tuple[int, Any]:
        """GET ``path`` with ``params`` and signing headers attached."""
        return super().get(path, params, self._signed_headers(page_code, headers))
|
||||
|
||||
|
||||
def create_cgate_client(
    signer: Optional[ZhilianSign] = None,
    proxy: Optional[str] = None,
    proxy_pool: Optional[list[str]] = None,
) -> ZhilianClient:
    """Build a ``ZhilianClient`` bound to ``CGATE_BASE_URL``.

    Args:
        signer: Optional signer instance.
        proxy: Optional single proxy URL.
        proxy_pool: Optional list of proxy URLs.

    Returns:
        The newly constructed client.
    """
    options = {"signer": signer, "proxy": proxy, "proxy_pool": proxy_pool}
    return ZhilianClient(base_url=CGATE_BASE_URL, **options)
|
||||
|
||||
|
||||
def create_capi_client(
    signer: Optional[ZhilianSign] = None,
    proxy: Optional[str] = None,
    proxy_pool: Optional[list[str]] = None,
) -> ZhilianClient:
    """Build a ``ZhilianClient`` bound to ``CAPI_BASE_URL``.

    Args:
        signer: Optional signer instance.
        proxy: Optional single proxy URL.
        proxy_pool: Optional list of proxy URLs.

    Returns:
        The newly constructed client.
    """
    options = {"signer": signer, "proxy": proxy, "proxy_pool": proxy_pool}
    return ZhilianClient(base_url=CAPI_BASE_URL, **options)
|
||||
63
app/services/crawler/_zhilian_sign.py
Normal file
63
app/services/crawler/_zhilian_sign.py
Normal file
@ -0,0 +1,63 @@
|
||||
# ⚠️ DEPRECATED — 2026-03-21
|
||||
# 此文件是内部手工复制文件,已废弃,不再由任何 facade 引用。
|
||||
# 请改用 spiderJobs.platforms.* 或 crawler_core 中的对应模块。
|
||||
# 将在下一里程碑中删除。
|
||||
#
|
||||
"""
|
||||
智联招聘签名算法
|
||||
复制自 spiderJobs/platforms/zhilian/sign.py — import 改为本地引用
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
import random
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class ZhilianSign:
    """Holds Zhilian credentials/device info and renders them as headers or params."""

    def __init__(
        self,
        *,
        at: str = "",
        rt: str = "",
        device_id: Optional[str] = None,
        version: str = "4.1.259",
        channel: str = "wxxiaochengxu",
        platform: str = "12",
    ):
        """Store the signing fields.

        Args:
            at: Value sent as ``x-zp-at`` / ``at``.
            rt: Value sent as ``x-zp-rt`` / ``rt``.
            device_id: Device id; a generated UUID-style string when falsy.
            version: Client version string.
            channel: Channel identifier.
            platform: Platform identifier.
        """
        self.at, self.rt = at, rt
        self.device_id = device_id or self.generate_uuid()
        self.version = version
        self.channel = channel
        self.platform = platform

    @staticmethod
    def generate_uuid() -> str:
        """Return a random 36-character, upper-case, UUID-v4-shaped string."""
        hex_digits = "0123456789ABCDEF"
        slots = [hex_digits[math.floor(16 * random.random())] for _ in range(36)]
        # Position 14 is fixed to "4"; position 19 is forced to 8, 9, A or B.
        slots[14] = "4"
        slots[19] = hex_digits[(int(slots[19], 16) & 0x3) | 0x8]
        for dash_at in (8, 13, 18, 23):
            slots[dash_at] = "-"
        return "".join(slots)

    def sign_headers(self, page_code: str = "0") -> dict:
        """Return the ``x-zp-*`` headers; the action id is regenerated per call."""
        headers = {"x-zp-at": self.at, "x-zp-rt": self.rt}
        headers["x-zp-action-id"] = self.generate_uuid()
        headers["x-zp-page-code"] = page_code
        headers["x-zp-version"] = self.version
        headers["x-zp-channel"] = self.channel
        headers["x-zp-platform"] = self.platform
        headers["x-zp-device-id"] = self.device_id
        headers["x-zp-business-system"] = "73"
        return headers

    def sign_params(self) -> dict:
        """Return the same identity fields as query parameters."""
        return dict(
            at=self.at,
            rt=self.rt,
            channel=self.channel,
            platform=self.platform,
            version=self.version,
            d=self.device_id,
        )
|
||||
@ -1,372 +1,139 @@
|
||||
import requests
|
||||
import time
|
||||
import json
|
||||
import uuid
|
||||
from typing import Dict, Any, Optional, List
|
||||
from app.core.algorithms.antispider import IPStrategyConfig, IPAnomalyDetector, SmartIPManager, generate_boss_trace_id, generate_token
|
||||
"""
|
||||
Boss直聘 Service — 基于新算法文件的封装
|
||||
保持对外公开接口不变(cleaning.py / company_cleaner.py 依赖)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from loguru import logger
|
||||
import os
|
||||
from urllib.parse import urlencode
|
||||
|
||||
from spiderJobs.platforms.boss.api import (
|
||||
GetBrandDetail,
|
||||
GetJobDetail,
|
||||
SearchBrandJobs,
|
||||
SearchRecJobs,
|
||||
)
|
||||
from spiderJobs.platforms.boss.client import BossClient, create_client
|
||||
from spiderJobs.platforms.boss.sign import BossSign
|
||||
|
||||
|
||||
class BossService:
    """Facade over the ``spiderJobs.platforms.boss`` client for Boss Zhipin.

    Every request is delegated to the platform API classes; each public
    method returns the result's ``data`` on success and ``None`` on failure.
    """

    def __init__(self, proxy_pool: Optional[List[Dict[str, str]]] = None):
        """Create the signer and the HTTP client.

        Args:
            proxy_pool: Optional list of proxy mappings. Only the first entry
                is used, preferring its ``"https"`` value over ``"http"``.
        """
        self._signer = BossSign()
        proxy = None
        if proxy_pool:
            first_route = proxy_pool[0]
            proxy = first_route.get("https") or first_route.get("http")
        self._client = create_client(signer=self._signer, proxy=proxy)

        # Kept as a public attribute so callers can inspect the login state
        # (e.g. ``boss_service.login_data.get("mpt")``).
        self.login_data: Dict[str, str] = {
            "mpt": "",
            "wt2": "",
            "openId": "",
        }

    def set_login_data(self, mpt: str, wt2: str, open_id: str = "") -> None:
        """Record login tokens and copy ``mpt`` / ``wt2`` onto the signer."""
        self.login_data.update({"mpt": mpt, "wt2": wt2, "openId": open_id})
        self._signer.mpt = mpt
        self._signer.wt2 = wt2

    def set_proxy(self, proxy: Optional[str]) -> None:
        """Rebuild the client with a new proxy.

        Args:
            proxy: Proxy URL; surrounding whitespace and backticks are
                stripped. A falsy value selects a direct connection.
        """
        if proxy:
            proxy = proxy.strip().strip("`")
        self._client = create_client(signer=self._signer, proxy=proxy or None)
        logger.info(f"BossService proxy set to: {proxy or 'direct'}")

    def get_job_detail_by_id(
        self, job_id: str, lid: str = "", security_id: str = ""
    ) -> Optional[Dict]:
        """Fetch a job posting's detail by its id.

        Returns:
            The result data on success, otherwise ``None``.
        """
        logger.info(f"获取招聘详情: {job_id}")
        try:
            fetcher = GetJobDetail(
                security_id=security_id, job_id=job_id, lid=lid,
                client=self._client,
            )
            result = fetcher.fetch()
            if result.success:
                return result.data
            logger.warning(f"Boss get_job_detail failed: {result.error}")
            return None
        except Exception as e:
            # Best-effort facade: failures are logged and reported as None.
            logger.error(f"Boss get_job_detail exception: {e}")
            return None

    def get_company_detail_by_id(self, company_id: str) -> Optional[Dict]:
        """Fetch a company's detail by its id.

        Returns:
            The result data on success, otherwise ``None``.
        """
        logger.info(f"获取公司详情: {company_id}")
        try:
            fetcher = GetBrandDetail(brand_id=company_id, client=self._client)
            result = fetcher.fetch()
            if result.success:
                return result.data
            logger.warning(f"Boss get_company_detail failed: {result.error}")
            return None
        except Exception as e:
            logger.error(f"Boss get_company_detail exception: {e}")
            return None

    def get_company_jobs_by_id(
        self, company_id: str, page: int = 1
    ) -> Optional[Dict]:
        """Fetch one page of a company's job list.

        Returns:
            The result data on success, otherwise ``None``.
        """
        logger.info(f"获取公司职位列表: {company_id}, page={page}")
        try:
            searcher = SearchBrandJobs(
                brand_id=company_id, page_size=15, client=self._client,
            )
            result = searcher.search(page_index=page)
            if result.success:
                return result.data
            logger.warning(f"Boss get_company_jobs failed: {result.error}")
            return None
        except Exception as e:
            logger.error(f"Boss get_company_jobs exception: {e}")
            return None

    def search_jobs(
        self, keyword: str, city_code: str = "101010100", page: int = 1
    ) -> Optional[Dict]:
        """Search jobs for a city and return one page of results.

        Returns:
            The result data on success, otherwise ``None``.
        """
        logger.info(f"Boss search_jobs: keyword={keyword}, city={city_code}, page={page}")
        try:
            # NOTE(review): ``keyword`` is logged but not passed to
            # SearchRecJobs — confirm whether that searcher accepts a query.
            searcher = SearchRecJobs(
                city_code=city_code, page_size=15, client=self._client,
            )
            result = searcher.search(page_index=page)
            if result.success:
                return result.data
            logger.warning(f"Boss search_jobs failed: {result.error}")
            return None
        except Exception as e:
            logger.error(f"Boss search_jobs exception: {e}")
            return None

    # ── asyncio.to_thread bridge (ARCH-06) ──────────────────────────

    async def async_get_job_detail(
        self, job_id: str, lid: str = "", security_id: str = ""
    ) -> Optional[Dict]:
        """Run ``get_job_detail_by_id`` in a worker thread."""
        import asyncio
        return await asyncio.to_thread(self.get_job_detail_by_id, job_id, lid, security_id)

    async def async_get_company_detail(self, company_id: str) -> Optional[Dict]:
        """Run ``get_company_detail_by_id`` in a worker thread."""
        import asyncio
        return await asyncio.to_thread(self.get_company_detail_by_id, company_id)

    async def async_get_company_jobs(
        self, company_id: str, page: int = 1
    ) -> Optional[Dict]:
        """Run ``get_company_jobs_by_id`` in a worker thread."""
        import asyncio
        return await asyncio.to_thread(self.get_company_jobs_by_id, company_id, page)

    async def async_search_jobs(
        self, keyword: str, city_code: str = "101010100", page: int = 1
    ) -> Optional[Dict]:
        """Run ``search_jobs`` in a worker thread."""
        import asyncio
        return await asyncio.to_thread(self.search_jobs, keyword, city_code, page)
|
||||
|
||||
@ -1,217 +1,76 @@
|
||||
import httpx
|
||||
import time
|
||||
import random
|
||||
import json
|
||||
import os
|
||||
from typing import Dict, Any, Optional, List
|
||||
from urllib.parse import quote
|
||||
"""
|
||||
前程无忧 (51Job) Service — 基于新算法文件的封装
|
||||
保持对外公开接口不变(cleaning.py / company_cleaner.py 依赖)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from loguru import logger
|
||||
from app.core.algorithms.signature import SignatureGenerator
|
||||
from jobs_spider.qcwy import search_company_jobs as qcwy_spider
|
||||
|
||||
from spiderJobs.platforms.job51.api import (
|
||||
GetCompanyInfo,
|
||||
GetJobDetail,
|
||||
SearchCompanyJobs,
|
||||
SearchRecommendJobs,
|
||||
)
|
||||
from spiderJobs.platforms.job51.client import Job51Client, create_client
|
||||
|
||||
|
||||
class QcwyService:
|
||||
def __init__(self, proxy_url: Optional[str] = None):
|
||||
self.signature_generator = SignatureGenerator("abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b")
|
||||
self.base_url = "https://cupid.51job.com"
|
||||
self.api_key = "51job"
|
||||
|
||||
self.base_headers = {
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF MacWechat/3.8.10(0x13080a10) XWEB/1227",
|
||||
"Connection": "keep-alive",
|
||||
"Accept": "*/*",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
"Content-Type": "application/json",
|
||||
"account-id": "",
|
||||
"From-Domain": "51job_weixin_wxapp",
|
||||
"xweb_xhr": "1",
|
||||
"user-token": "",
|
||||
"uuid": str(int(time.time() * 1000)) + str(random.randint(10000000, 99999999)),
|
||||
"partner": "",
|
||||
"timestamp": str(int(time.time() * 1000)),
|
||||
"Sec-Fetch-Site": "cross-site",
|
||||
"Sec-Fetch-Mode": "cors",
|
||||
"Sec-Fetch-Dest": "empty",
|
||||
"Referer": "https://servicewechat.com/wx1131e5c71e668b5d/391/page-frame.html",
|
||||
"Accept-Language": "zh-CN,zh;q=0.9"
|
||||
}
|
||||
env_account_id = os.getenv("QCWY_ACCOUNT_ID", "").strip()
|
||||
env_user_token = os.getenv("QCWY_USER_TOKEN", "").strip()
|
||||
if env_account_id:
|
||||
self.base_headers["account-id"] = env_account_id
|
||||
if env_user_token:
|
||||
self.base_headers["user-token"] = env_user_token
|
||||
|
||||
client_kwargs = {
|
||||
"timeout": 30.0,
|
||||
"verify": True,
|
||||
"trust_env": False
|
||||
}
|
||||
if proxy_url:
|
||||
client_kwargs["proxy"] = proxy_url
|
||||
self.client = httpx.Client(**client_kwargs)
|
||||
self._client = create_client(proxy=proxy_url or None)
|
||||
|
||||
def set_proxy(self, proxy_url: Optional[str]) -> None:
|
||||
client_kwargs = {
|
||||
"timeout": 30.0,
|
||||
"verify": True,
|
||||
"trust_env": False,
|
||||
}
|
||||
if proxy_url:
|
||||
client_kwargs["proxy"] = proxy_url
|
||||
try:
|
||||
old_client = self.client
|
||||
except AttributeError:
|
||||
old_client = None
|
||||
self.client = httpx.Client(**client_kwargs)
|
||||
if old_client is not None:
|
||||
try:
|
||||
old_client.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _sanitize_headers(self, headers: Dict[str, Any]) -> Dict[str, Any]:
|
||||
masked_headers: Dict[str, Any] = {}
|
||||
for k, v in headers.items():
|
||||
key_lower = str(k).lower()
|
||||
if key_lower in {"authorization", "cookie", "set-cookie"}:
|
||||
masked_headers[k] = "***"
|
||||
else:
|
||||
masked_headers[k] = v
|
||||
return masked_headers
|
||||
|
||||
def _log_request_response(
|
||||
self,
|
||||
label: str,
|
||||
method: str,
|
||||
url: str,
|
||||
headers: Dict[str, Any],
|
||||
params: Optional[Dict[str, Any]] = None,
|
||||
json_body: Optional[Dict[str, Any]] = None,
|
||||
response: Optional[httpx.Response] = None,
|
||||
) -> None:
|
||||
safe_headers = self._sanitize_headers(headers)
|
||||
logger.info(
|
||||
f"[Qcwy-{label}] request method={method} url={url} headers={safe_headers} "
|
||||
f"params={params} json={json_body}"
|
||||
)
|
||||
if response is not None:
|
||||
text_sample = ""
|
||||
try:
|
||||
body = response.text or ""
|
||||
text_sample = body[:1000]
|
||||
except Exception:
|
||||
text_sample = "<unreadable>"
|
||||
logger.info(
|
||||
f"[Qcwy-{label}] response status={response.status_code} "
|
||||
f"headers={self._sanitize_headers(dict(response.headers))} "
|
||||
f"body_sample={text_sample}"
|
||||
)
|
||||
|
||||
def build_property(self, page_code: str = "home|hotjob|jobfxlist") -> str:
|
||||
distinct_id = str(int(time.time() * 1000)) + str(random.randint(100000, 999999))
|
||||
property_data = {
|
||||
"frompageUrl": "",
|
||||
"pageUrl": "pages/index/index",
|
||||
"isLogin": "否",
|
||||
"accountid": "",
|
||||
"resumeId": "",
|
||||
"firstFrompageUrl": "",
|
||||
"distinct_id": distinct_id,
|
||||
"pageCode": page_code,
|
||||
"shortPageCode": page_code,
|
||||
"policyType": "推荐"
|
||||
}
|
||||
return quote(json.dumps(property_data, ensure_ascii=False, separators=(',', ':')))
|
||||
|
||||
def _make_request(self, url: str, data: Dict[str, Any] = None, headers: Dict[str, str] = None, method: str = "POST") -> Optional[Dict[str, Any]]:
|
||||
try:
|
||||
local_headers: Dict[str, str] = headers or {}
|
||||
if method.upper() == "GET":
|
||||
response = self.client.get(url, headers=local_headers)
|
||||
else:
|
||||
response = self.client.post(url, headers=local_headers, json=data)
|
||||
self._log_request_response(
|
||||
"request",
|
||||
method.upper(),
|
||||
url,
|
||||
local_headers,
|
||||
params=None,
|
||||
json_body=data if method.upper() != "GET" else None,
|
||||
response=response,
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
return response.json()
|
||||
else:
|
||||
logger.warning(f"Request failed: {response.status_code} - {response.text}")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"Request exception: {e}")
|
||||
return None
|
||||
self._client = create_client(proxy=proxy_url or None)
|
||||
logger.info(f"QcwyService proxy set to: {proxy_url or 'direct'}")
|
||||
|
||||
def get_job_detail(self, job_id: str) -> Dict[str, Any]:
|
||||
timestamp = int(time.time())
|
||||
api_path = f"open/noauth/jobs/detail/base/{job_id}"
|
||||
url_path = f"/{api_path}?api_key={self.api_key}×tamp={timestamp}"
|
||||
full_url = f"{self.base_url}{url_path}"
|
||||
|
||||
signature = self.signature_generator.generate_signature(url_path)
|
||||
property_value = self.build_property(page_code="pages/jobs/jobdetail/jobdetail")
|
||||
|
||||
headers = self.base_headers.copy()
|
||||
headers["sign"] = signature
|
||||
headers["property"] = property_value
|
||||
headers["Content-Type"] = "application/x-www-form-urlencoded"
|
||||
|
||||
response = self._make_request(full_url, None, headers, method="GET")
|
||||
if response and response.get('status') in ['1', 1]:
|
||||
return response.get('resultbody', {})
|
||||
return {}
|
||||
|
||||
def get_company_info(self, company_id: str) -> Dict[str, Any]:
|
||||
"""获取职位详情"""
|
||||
logger.info(f"Qcwy get_job_detail: {job_id}")
|
||||
try:
|
||||
return qcwy_spider.get_company_info(company_id)
|
||||
fetcher = GetJobDetail(job_id=job_id, client=self._client)
|
||||
result = fetcher.fetch()
|
||||
if result.success:
|
||||
return result.data or {}
|
||||
logger.warning(f"Qcwy get_job_detail failed: {result.error}")
|
||||
return {}
|
||||
except Exception as e:
|
||||
logger.error(f"Qcwy get_company_info failed: {e}")
|
||||
logger.error(f"Qcwy get_job_detail exception: {e}")
|
||||
return {}
|
||||
|
||||
def search_jobs(self, keyword: str, job_area: str = "020000", page: int = 1) -> List[Dict[str, Any]]:
|
||||
# This uses the recommend/search logic
|
||||
timestamp = int(time.time())
|
||||
data = {
|
||||
"pageNo": page,
|
||||
"pageSize": 20,
|
||||
"keyword": keyword, # QCwy usually recommends, but let's assume recommend for now or search
|
||||
"jobArea": job_area,
|
||||
"type": "recommend", # fallback to recommend if keyword search API is different/complex
|
||||
"isTouristMode": True,
|
||||
"specialPageCode": True
|
||||
}
|
||||
# Note: QCwy search API might be different, but using the recommend endpoint from original script
|
||||
# If real search is needed, we might need to reverse engineer 'search/job-list' endpoint.
|
||||
# For now, let's stick to what was in the script or use recommend.
|
||||
# The original script used `open/noauth/recommend/job-tab-dynamic-wx-mini`
|
||||
|
||||
api_path = "open/noauth/recommend/job-tab-dynamic-wx-mini"
|
||||
url_path = f"/{api_path}?api_key={self.api_key}×tamp={timestamp}"
|
||||
full_url = f"{self.base_url}{url_path}"
|
||||
|
||||
signature = self.signature_generator.generate_signature(url_path, data)
|
||||
property_value = self.build_property()
|
||||
|
||||
headers = self.base_headers.copy()
|
||||
headers["sign"] = signature
|
||||
headers["property"] = property_value
|
||||
|
||||
# Convert bools
|
||||
for key, value in data.items():
|
||||
if isinstance(value, bool):
|
||||
data[key] = "true" if value else "false"
|
||||
|
||||
response = self._make_request(full_url, data, headers, method="POST")
|
||||
if response and response.get("status") in ['1', 1]:
|
||||
return response.get("resultbody", {}).get("jobList", {}).get("items", [])
|
||||
return []
|
||||
def get_company_info(self, company_id: str) -> Dict[str, Any]:
    """Fetch company information for the given company id.

    Builds a ``GetCompanyInfo`` fetcher bound to ``self._client`` and runs it.

    Returns:
        ``result.data`` (or ``{}`` when it is falsy) on success; ``{}`` when
        the result reports failure or when any exception is raised. Failures
        are logged, never propagated.
    """
    logger.info(f"Qcwy get_company_info: {company_id}")
    try:
        result = GetCompanyInfo(company_id=company_id, client=self._client).fetch()
        # Guard clause: report the failure and bail out with an empty payload.
        if not result.success:
            logger.warning(f"Qcwy get_company_info failed: {result.error}")
            return {}
        return result.data or {}
    except Exception as e:
        logger.error(f"Qcwy get_company_info exception: {e}")
        return {}
|
||||
|
||||
def search_jobs(
    self, keyword: str, job_area: str = "020000", page: int = 1
) -> List[Dict[str, Any]]:
    """Search job postings and return them as a list.

    Runs a ``SearchRecommendJobs`` query (page size fixed at 20) against
    ``self._client`` for the requested page.

    Returns:
        ``result.list`` (or ``[]`` when it is falsy) on success; ``[]`` when
        the result reports failure or when any exception is raised.
    """
    logger.info(f"Qcwy search_jobs: keyword={keyword}, area={job_area}, page={page}")
    try:
        # NOTE(review): `keyword` is logged above but is not passed to
        # SearchRecommendJobs here — confirm that is intended.
        result = SearchRecommendJobs(
            job_area=job_area, page_size=20, client=self._client,
        ).search(page_index=page)
        if not result.success:
            logger.warning(f"Qcwy search_jobs failed: {result.error}")
            return []
        return result.list or []
    except Exception as e:
        logger.error(f"Qcwy search_jobs exception: {e}")
        return []
|
||||
|
||||
def get_company_jobs_by_id(
|
||||
self,
|
||||
@ -222,12 +81,47 @@ class QcwyService:
|
||||
function: str = "",
|
||||
salary_type: str = "",
|
||||
) -> Dict[str, Any]:
|
||||
"""获取公司职位列表"""
|
||||
logger.info(f"Qcwy get_company_jobs: company={company_id}, page={page}")
|
||||
try:
|
||||
return qcwy_spider.company_jobs_by_id(
|
||||
co_id=company_id,
|
||||
page=page,
|
||||
size=page_size,
|
||||
searcher = SearchCompanyJobs(
|
||||
company_id=company_id,
|
||||
job_area=job_area,
|
||||
function=function,
|
||||
salary_type=salary_type,
|
||||
page_size=page_size,
|
||||
client=self._client,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Qcwy get_company_jobs_by_id failed: {e}")
|
||||
result = searcher.search(page_index=page)
|
||||
if result.success:
|
||||
return result.data or {}
|
||||
logger.warning(f"Qcwy get_company_jobs failed: {result.error}")
|
||||
return {}
|
||||
except Exception as e:
|
||||
logger.error(f"Qcwy get_company_jobs exception: {e}")
|
||||
return {}
|
||||
|
||||
# ── asyncio.to_thread 桥接(ARCH-06)────────────────────────
|
||||
|
||||
async def async_get_job_detail(self, job_id: str) -> Dict:
    """Async bridge for ``get_job_detail``.

    Runs the blocking ``self.get_job_detail(job_id)`` in a worker thread via
    ``asyncio.to_thread`` and returns its result unchanged.
    """
    from asyncio import to_thread

    detail = await to_thread(self.get_job_detail, job_id)
    return detail
|
||||
|
||||
async def async_get_company_info(self, company_id: str) -> Dict:
    """Async bridge for ``get_company_info``.

    The blocking ``self.get_company_info(company_id)`` call is executed in a
    worker thread through ``asyncio.to_thread``; its value is returned as-is.
    """
    from asyncio import to_thread

    info = await to_thread(self.get_company_info, company_id)
    return info
|
||||
|
||||
async def async_get_company_jobs(
    self, company_id: str, page: int = 1, page_size: int = 30, **kwargs
) -> Dict:
    """Async bridge for ``get_company_jobs_by_id``.

    Runs the blocking ``self.get_company_jobs_by_id`` in a worker thread via
    ``asyncio.to_thread`` and returns its result unchanged.

    Args:
        company_id: Company identifier, passed positionally.
        page: Page number, passed positionally.
        page_size: Page size, passed positionally.
        **kwargs: Extra keyword filters forwarded verbatim to
            ``get_company_jobs_by_id``. They were previously accepted but
            silently discarded, so filters had no effect on the async path.

    Raises:
        TypeError: If ``kwargs`` contains a name the synchronous method does
            not accept (raised by the call itself).
    """
    import asyncio

    # asyncio.to_thread forwards both *args and **kwargs to the callable.
    return await asyncio.to_thread(
        self.get_company_jobs_by_id, company_id, page, page_size, **kwargs
    )
|
||||
|
||||
async def async_search_jobs(
    self, keyword: str, job_area: str = "020000", page: int = 1
) -> List:
    """Async bridge for ``search_jobs``.

    Executes ``self.search_jobs(keyword, job_area, page)`` in a worker thread
    through ``asyncio.to_thread`` and returns the resulting value.
    """
    from asyncio import to_thread

    positional = (keyword, job_area, page)
    jobs = await to_thread(self.search_jobs, *positional)
    return jobs
|
||||
|
||||
|
||||
@ -1,328 +1,67 @@
|
||||
import os
|
||||
import requests
|
||||
import time
|
||||
import random
|
||||
from typing import Dict, Any, List, Optional
|
||||
from urllib.parse import urlencode
|
||||
"""
|
||||
智联招聘 Service — 基于新算法文件的封装
|
||||
保持对外公开接口不变(cleaning.py / company_cleaner.py 依赖)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from spiderJobs.platforms.zhilian.api import (
|
||||
GetCompanyDetail,
|
||||
GetPositionDetail,
|
||||
SearchCompanyPositions,
|
||||
SearchPositions,
|
||||
)
|
||||
from spiderJobs.platforms.zhilian.client import (
|
||||
ZhilianClient,
|
||||
create_capi_client,
|
||||
create_cgate_client,
|
||||
)
|
||||
from spiderJobs.platforms.zhilian.sign import ZhilianSign
|
||||
|
||||
|
||||
class ZhilianService:
|
||||
def __init__(self, proxy_url: Optional[str] = None):
|
||||
self.session = requests.Session()
|
||||
if proxy_url:
|
||||
self.session.proxies = {"http": proxy_url, "https": proxy_url}
|
||||
self._signer = ZhilianSign()
|
||||
self._cgate = create_cgate_client(signer=self._signer, proxy=proxy_url or None)
|
||||
self._capi = create_capi_client(signer=self._signer, proxy=proxy_url or None)
|
||||
|
||||
def set_proxy(self, proxy_url: Optional[str]) -> None:
|
||||
if not proxy_url:
|
||||
self.session.proxies = {}
|
||||
return
|
||||
proxy_url = proxy_url.strip().strip("`")
|
||||
self.session.proxies = {"http": proxy_url, "https": proxy_url}
|
||||
|
||||
def _sanitize_headers(self, headers: Dict[str, Any]) -> Dict[str, Any]:
|
||||
masked_headers: Dict[str, Any] = {}
|
||||
for k, v in headers.items():
|
||||
key_lower = str(k).lower()
|
||||
if key_lower in {"authorization", "cookie", "set-cookie"}:
|
||||
masked_headers[k] = "***"
|
||||
else:
|
||||
masked_headers[k] = v
|
||||
return masked_headers
|
||||
|
||||
def _log_request_response(
|
||||
self,
|
||||
label: str,
|
||||
method: str,
|
||||
url: str,
|
||||
headers: Dict[str, Any],
|
||||
params: Optional[Dict[str, Any]] = None,
|
||||
json_body: Optional[Dict[str, Any]] = None,
|
||||
response: Optional[requests.Response] = None,
|
||||
) -> None:
|
||||
safe_headers = self._sanitize_headers(headers)
|
||||
logger.info(
|
||||
f"[Zhilian-{label}] request method={method} url={url} headers={safe_headers} "
|
||||
f"params={params} json={json_body}"
|
||||
)
|
||||
try:
|
||||
curl_url = url
|
||||
if params and isinstance(params, dict):
|
||||
query_string = urlencode(params)
|
||||
if query_string:
|
||||
separator = "&" if "?" in curl_url else "?"
|
||||
curl_url = f"{curl_url}{separator}{query_string}"
|
||||
header_parts = []
|
||||
for k, v in safe_headers.items():
|
||||
v_str = str(v).replace("'", "'\"'\"'")
|
||||
header_parts.append(f"-H '{k}: {v_str}'")
|
||||
data_part = ""
|
||||
if json_body is not None:
|
||||
body_str = json.dumps(json_body, ensure_ascii=False)
|
||||
body_str = body_str.replace("'", "'\"'\"'")
|
||||
data_part = f" --data '{body_str}'"
|
||||
curl_cmd = f"curl -X {method.upper()} '{curl_url}' " + " ".join(header_parts) + data_part
|
||||
logger.info(f"[Zhilian-{label}] curl_debug {curl_cmd}")
|
||||
except Exception as e:
|
||||
logger.debug(f"[Zhilian-{label}] build curl error: {e}")
|
||||
if response is not None:
|
||||
text_sample = ""
|
||||
try:
|
||||
body = response.text or ""
|
||||
text_sample = body[:1000]
|
||||
except Exception:
|
||||
text_sample = "<unreadable>"
|
||||
logger.info(
|
||||
f"[Zhilian-{label}] response status={response.status_code} "
|
||||
f"headers={self._sanitize_headers(dict(response.headers))} "
|
||||
f"body_sample={text_sample}"
|
||||
)
|
||||
|
||||
def _gen_client_id(self) -> str:
|
||||
t = int(time.time() * 1000)
|
||||
try:
|
||||
t += int(time.perf_counter() * 1000)
|
||||
except Exception:
|
||||
pass
|
||||
def repl(c: str) -> str:
|
||||
n = int((t + random.random() * 16) % 16)
|
||||
if c == 'x':
|
||||
return hex(n)[2:]
|
||||
return hex((n & 0x3) | 0x8)[2:]
|
||||
tpl = "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx"
|
||||
return ''.join(repl(c) if c in 'xy' else c for c in tpl)
|
||||
|
||||
def _gen_v(self) -> float:
|
||||
return round(random.random(), 8)
|
||||
|
||||
def _gen_page_request_id(self) -> str:
|
||||
return f"cf1e3b3e655b4eb5a306110a83c77c29-{int(time.time()*1000)}-{random.randint(0,999999)}"
|
||||
|
||||
def _build_headers_pc(self) -> Dict[str, str]:
|
||||
return {
|
||||
"accept": "application/json, text/plain, */*",
|
||||
"accept-language": "zh-CN,zh;q=0.9",
|
||||
"content-type": "application/json;charset=UTF-8",
|
||||
"origin": "https://www.zhaopin.com",
|
||||
"priority": "u=1, i",
|
||||
"referer": "https://www.zhaopin.com/",
|
||||
"sec-ch-ua-mobile": "?0",
|
||||
"sec-fetch-dest": "empty",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-site": "same-site",
|
||||
"x-zp-page-code": "0",
|
||||
}
|
||||
|
||||
def _request_json(self, method: str, url: str, headers: Dict[str, str], params: Optional[Dict[str, Any]] = None,
|
||||
json_body: Optional[Dict[str, Any]] = None, timeout: int = 30) -> Optional[Dict[str, Any]]:
|
||||
try:
|
||||
resp = self.session.request(method.upper(), url, headers=headers, params=params, json=json_body, timeout=timeout)
|
||||
self._log_request_response(
|
||||
"request",
|
||||
method.upper(),
|
||||
url,
|
||||
headers,
|
||||
params=params,
|
||||
json_body=json_body,
|
||||
response=resp,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
return resp.json()
|
||||
except Exception as e:
|
||||
logger.error(f"Request failed: {e}")
|
||||
return None
|
||||
|
||||
def fetch_company_desc_by_job(self, number: str) -> Optional[str]:
|
||||
client_id = self._gen_client_id()
|
||||
url_pc = "https://fe-api.zhaopin.com/c/i/jobs/position-detail-new"
|
||||
params_pc = {
|
||||
"number": number,
|
||||
"_v": self._gen_v(),
|
||||
"x-zp-page-request-id": self._gen_page_request_id(),
|
||||
"x-zp-client-id": client_id,
|
||||
}
|
||||
headers_pc = {
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
|
||||
"Cookie": f"x-zp-client-id={client_id}"
|
||||
}
|
||||
# Merge basic headers
|
||||
headers_pc.update(self._build_headers_pc())
|
||||
|
||||
data_pc = self._request_json("GET", url_pc, headers_pc, params=params_pc)
|
||||
if data_pc and isinstance(data_pc, dict):
|
||||
detail = data_pc.get("data") or {}
|
||||
comp = detail.get("detailedCompany") or {}
|
||||
desc_pc = comp.get("companyDescription")
|
||||
if isinstance(desc_pc, str) and desc_pc:
|
||||
return desc_pc
|
||||
return None
|
||||
|
||||
def search_jobs(self, city_id: int = 801, page_size: int = 15, page_index: int = 1, job_level3_code: Optional[str] = None) -> List[Dict[str, Any]]:
|
||||
headers = self._build_headers_pc()
|
||||
base_url = "https://fe-api.zhaopin.com/c/i/search/positions"
|
||||
|
||||
params = {
|
||||
"_v": self._gen_v(),
|
||||
"x-zp-page-request-id": self._gen_page_request_id(),
|
||||
"x-zp-client-id": self._gen_client_id(),
|
||||
}
|
||||
payload = {
|
||||
"S_SOU_WORK_CITY": "",
|
||||
"order": 4,
|
||||
"pageSize": page_size,
|
||||
"pageIndex": page_index,
|
||||
"eventScenario": "pcSearchedSouSearch",
|
||||
"anonymous": 1,
|
||||
"platform": 13,
|
||||
"version": "0.0.0",
|
||||
}
|
||||
if job_level3_code:
|
||||
payload["S_SOU_JD_JOB_LEVEL3"] = job_level3_code
|
||||
|
||||
data = self._request_json("POST", base_url, headers, params=params, json_body=payload)
|
||||
if data and data.get("code") == 200:
|
||||
lst = data.get("data", {}).get("list", [])
|
||||
for job in lst:
|
||||
num = job.get("number")
|
||||
if num:
|
||||
desc = self.fetch_company_desc_by_job(str(num)) or ""
|
||||
job["companyDesc"] = desc
|
||||
return lst
|
||||
return []
|
||||
|
||||
def search_company_jobs_by_name(self, company_name: str, city_id: Optional[int] = None, page_size: int = 15, page_index: int = 1) -> Optional[Dict[str, Any]]:
|
||||
url = "https://cgate.zhaopin.com/positionbusiness/searchrecommend/searchPositions"
|
||||
ua = os.getenv(
|
||||
"ZP_MINIAPP_UA",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 "
|
||||
"MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Mac "
|
||||
"MacWechat/WMPF MacWechat/3.8.7(0x13080712) UnifiedPCMacWechat(0xf26414f0) XWEB/16962",
|
||||
)
|
||||
headers: Dict[str, Any] = {
|
||||
"User-Agent": ua,
|
||||
"Content-Type": "application/json",
|
||||
"x-zp-channel": "wxxiaochengxu",
|
||||
"x-zp-business-system": "73",
|
||||
"x-zp-action-id": "",
|
||||
"xweb_xhr": "1",
|
||||
"x-zp-page-code": "7019",
|
||||
"x-zp-version": os.getenv("ZP_MINIAPP_VERSION", "4.1.224"),
|
||||
"x-zp-platform": "12",
|
||||
"x-zp-device-id": os.getenv("ZP_MINIAPP_DEVICE_ID", "A774EA47-0AB5-4608-B51D-84BF51CC0786"),
|
||||
"sec-fetch-site": "cross-site",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
"referer": "https://servicewechat.com/wxb7718fb9257e4fd2/617/page-frame.html",
|
||||
"accept-language": "zh-CN,zh;q=0.9",
|
||||
"priority": "u=1, i",
|
||||
}
|
||||
at_token = os.getenv("ZP_MINIAPP_AT", "").strip()
|
||||
rt_token = os.getenv("ZP_MINIAPP_RT", "").strip()
|
||||
if at_token:
|
||||
headers["x-zp-at"] = at_token
|
||||
if rt_token:
|
||||
headers["x-zp-rt"] = rt_token
|
||||
|
||||
body: Dict[str, Any] = {
|
||||
"eventScenario": "wxmpZhaopinSearchV2",
|
||||
"pageIndex": page_index,
|
||||
"pageSize": page_size,
|
||||
"filterMinSalary": 1,
|
||||
"S_SOU_EXPAND": "SOU_COMPANY_ID",
|
||||
"S_SOU_FULL_INDEX": company_name,
|
||||
"sortType": "DEFAULT",
|
||||
"version": "8.11.22",
|
||||
"identity": "2",
|
||||
"anonymous": 0,
|
||||
}
|
||||
if city_id is not None:
|
||||
body["S_SOU_WORK_CITY"] = city_id
|
||||
resume_number = os.getenv("ZP_MINIAPP_RESUME_NUMBER", "").strip()
|
||||
if resume_number:
|
||||
body["resumeNumber"] = resume_number
|
||||
|
||||
try:
|
||||
resp = self.session.post(url, headers=headers, json=body, timeout=30)
|
||||
self._log_request_response(
|
||||
"search-company-jobs",
|
||||
"POST",
|
||||
url,
|
||||
headers,
|
||||
params=None,
|
||||
json_body=body,
|
||||
response=resp,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
data = resp.json()
|
||||
return data
|
||||
except Exception as e:
|
||||
logger.error(f"Zhilian search_company_jobs_by_name failed: {e}")
|
||||
return None
|
||||
proxy = proxy_url.strip().strip("`") if proxy_url else None
|
||||
self._cgate = create_cgate_client(signer=self._signer, proxy=proxy)
|
||||
self._capi = create_capi_client(signer=self._signer, proxy=proxy)
|
||||
logger.info(f"ZhilianService proxy set to: {proxy or 'direct'}")
|
||||
|
||||
def get_job_detail(self, job_number: str) -> Optional[Dict[str, Any]]:
|
||||
# Reuse fetch_company_desc_by_job logic but return full detail
|
||||
client_id = self._gen_client_id()
|
||||
url_pc = "https://fe-api.zhaopin.com/c/i/jobs/position-detail-new"
|
||||
params_pc = {
|
||||
"number": job_number,
|
||||
"_v": self._gen_v(),
|
||||
"x-zp-page-request-id": self._gen_page_request_id(),
|
||||
"x-zp-client-id": client_id,
|
||||
}
|
||||
headers_pc = {
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
|
||||
"Cookie": f"x-zp-client-id={client_id}"
|
||||
}
|
||||
headers_pc.update(self._build_headers_pc())
|
||||
|
||||
data_pc = self._request_json("GET", url_pc, headers_pc, params=params_pc)
|
||||
if data_pc and isinstance(data_pc, dict):
|
||||
return data_pc.get("data")
|
||||
return None
|
||||
"""获取职位详情"""
|
||||
logger.info(f"Zhilian get_job_detail: {job_number}")
|
||||
try:
|
||||
fetcher = GetPositionDetail(number=job_number, client=self._cgate)
|
||||
result = fetcher.fetch()
|
||||
if result.success:
|
||||
return result.data
|
||||
logger.warning(f"Zhilian get_job_detail failed: {result.error}")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"Zhilian get_job_detail exception: {e}")
|
||||
return None
|
||||
|
||||
def get_company_detail(self, company_number: str) -> Optional[Dict[str, Any]]:
|
||||
"""获取公司详情"""
|
||||
url = "https://cgate.zhaopin.com/positionbusiness/exposure/companyDetail"
|
||||
params = {
|
||||
"number": company_number,
|
||||
"platform": "12",
|
||||
"version": "0.0.0",
|
||||
}
|
||||
ua = os.getenv(
|
||||
"ZP_MINIAPP_UA",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 "
|
||||
"MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Mac "
|
||||
"MacWechat/WMPF MacWechat/3.8.7(0x13080712) UnifiedPCMacWechat(0xf26414f0) XWEB/16962",
|
||||
)
|
||||
headers = {
|
||||
"User-Agent": ua,
|
||||
"x-zp-channel": "wxxiaochengxu",
|
||||
"x-zp-business-system": "73",
|
||||
"xweb_xhr": "1",
|
||||
"x-zp-page-code": "0",
|
||||
"x-zp-version": os.getenv("ZP_MINIAPP_VERSION", "4.1.224"),
|
||||
"x-zp-platform": "12",
|
||||
"x-zp-device-id": os.getenv("ZP_MINIAPP_DEVICE_ID", "A774EA47-0AB5-4608-B51D-84BF51CC0786"),
|
||||
"content-type": "application/json",
|
||||
"sec-fetch-site": "cross-site",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
"referer": "https://servicewechat.com/wxb7718fb9257e4fd2/617/page-frame.html",
|
||||
"accept-language": "zh-CN,zh;q=0.9",
|
||||
"priority": "u=1, i",
|
||||
}
|
||||
at_token = os.getenv("ZP_MINIAPP_AT", "").strip()
|
||||
rt_token = os.getenv("ZP_MINIAPP_RT", "").strip()
|
||||
if at_token:
|
||||
headers["x-zp-at"] = at_token
|
||||
if rt_token:
|
||||
headers["x-zp-rt"] = rt_token
|
||||
|
||||
data = self._request_json("GET", url, headers, params=params)
|
||||
if data and isinstance(data, dict):
|
||||
return data.get("data")
|
||||
return None
|
||||
logger.info(f"Zhilian get_company_detail: {company_number}")
|
||||
try:
|
||||
fetcher = GetCompanyDetail(number=company_number, client=self._cgate)
|
||||
result = fetcher.fetch()
|
||||
if result.success:
|
||||
return result.data
|
||||
logger.warning(f"Zhilian get_company_detail failed: {result.error}")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"Zhilian get_company_detail exception: {e}")
|
||||
return None
|
||||
|
||||
def get_company_jobs_by_id(
|
||||
self,
|
||||
@ -331,56 +70,102 @@ class ZhilianService:
|
||||
page_size: int = 30,
|
||||
work_city: Optional[int] = None,
|
||||
) -> Optional[Dict[str, Any]]:
|
||||
url = "https://capi.zhaopin.com/capi/searchrecommend/searchPositionsCompany"
|
||||
ua = os.getenv(
|
||||
"ZP_MINIAPP_UA",
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36 "
|
||||
"MicroMessenger/7.0.20.1781(0x6700143B) NetType/WIFI MiniProgramEnv/Mac "
|
||||
"MacWechat/WMPF MacWechat/3.8.7(0x13080712) UnifiedPCMacWechat(0xf26414f0) XWEB/16962",
|
||||
"""获取公司职位列表"""
|
||||
logger.info(f"Zhilian get_company_jobs: company={company_number}, page={page_index}")
|
||||
try:
|
||||
searcher = SearchCompanyPositions(
|
||||
company_id=company_number,
|
||||
city_code=str(work_city) if work_city is not None else "",
|
||||
page_size=page_size,
|
||||
client=self._capi,
|
||||
)
|
||||
result = searcher.search(page_index=page_index)
|
||||
if result.success:
|
||||
return result.data
|
||||
logger.warning(f"Zhilian get_company_jobs failed: {result.error}")
|
||||
return None
|
||||
except Exception as e:
|
||||
logger.error(f"Zhilian get_company_jobs exception: {e}")
|
||||
return None
|
||||
|
||||
def search_company_jobs_by_name(
    self,
    company_name: str,
    city_id: Optional[int] = None,
    page_size: int = 15,
    page_index: int = 1,
) -> Optional[Dict[str, Any]]:
    """Search positions using a company name as the keyword.

    Runs a ``SearchPositions`` query against ``self._cgate``. ``city_id`` is
    passed through as ``city_code``; ``None`` is replaced by an empty string.

    Returns:
        ``result.data`` on success; ``None`` when the result reports failure
        or when any exception is raised (both cases are logged).
    """
    logger.info(f"Zhilian search_company_jobs_by_name: {company_name}")
    try:
        city_code = "" if city_id is None else city_id
        result = SearchPositions(
            keyword=company_name,
            city_code=city_code,
            page_size=page_size,
            client=self._cgate,
        ).search(page_index=page_index)
        if not result.success:
            logger.warning(f"Zhilian search_company_jobs failed: {result.error}")
            return None
        return result.data
    except Exception as e:
        logger.error(f"Zhilian search_company_jobs exception: {e}")
        return None
|
||||
|
||||
def search_jobs(
    self,
    city_id: int = 801,
    page_size: int = 15,
    page_index: int = 1,
    job_level3_code: Optional[str] = None,
) -> List[Dict[str, Any]]:
    """Search positions for a city and return them as a list.

    Runs a ``SearchPositions`` query against ``self._cgate``. When
    ``job_level3_code`` is truthy it is sent as the ``S_SOU_POSITION_TYPE``
    filter; otherwise the filter dict is empty.

    Returns:
        ``result.list`` (or ``[]`` when it is falsy) on success; ``[]`` when
        the result reports failure or when any exception is raised.
    """
    logger.info(f"Zhilian search_jobs: city={city_id}, page={page_index}")
    try:
        filters = {"S_SOU_POSITION_TYPE": job_level3_code} if job_level3_code else {}
        result = SearchPositions(
            city_code=city_id,
            filters=filters,
            page_size=page_size,
            client=self._cgate,
        ).search(page_index=page_index)
        if not result.success:
            logger.warning(f"Zhilian search_jobs failed: {result.error}")
            return []
        return result.list or []
    except Exception as e:
        logger.error(f"Zhilian search_jobs exception: {e}")
        return []
|
||||
|
||||
# ── asyncio.to_thread 桥接(ARCH-06)────────────────────────
|
||||
|
||||
async def async_get_job_detail(self, job_number: str) -> Optional[Dict]:
    """Async bridge for ``get_job_detail``.

    Runs the blocking ``self.get_job_detail(job_number)`` in a worker thread
    via ``asyncio.to_thread`` and returns its result unchanged.
    """
    from asyncio import to_thread

    detail = await to_thread(self.get_job_detail, job_number)
    return detail
|
||||
|
||||
async def async_get_company_detail(self, company_number: str) -> Optional[Dict]:
    """Async bridge for ``get_company_detail``.

    The blocking ``self.get_company_detail(company_number)`` call runs in a
    worker thread through ``asyncio.to_thread``; its value is returned as-is.
    """
    from asyncio import to_thread

    company = await to_thread(self.get_company_detail, company_number)
    return company
|
||||
|
||||
async def async_get_company_jobs(
    self, company_number: str, page_index: int = 1, page_size: int = 30,
    work_city: Optional[int] = None,
) -> Optional[Dict]:
    """Async bridge for ``get_company_jobs_by_id``.

    Calls ``self.get_company_jobs_by_id(company_number, page_index,
    page_size, work_city)`` in a worker thread via ``asyncio.to_thread`` and
    returns its result unchanged.
    """
    from asyncio import to_thread

    positional = (company_number, page_index, page_size, work_city)
    jobs = await to_thread(self.get_company_jobs_by_id, *positional)
    return jobs
|
||||
version = os.getenv("ZP_MINIAPP_CAPI_VERSION", "4.1.230")
|
||||
device_id = os.getenv("ZP_MINIAPP_DEVICE_ID", "CFD341F3-29D6-4C46-81BF-F6C705407F2E")
|
||||
headers: Dict[str, Any] = {
|
||||
"User-Agent": ua,
|
||||
"x-zp-channel": "wxxiaochengxu",
|
||||
"x-zp-business-system": "73",
|
||||
"x-zp-action-id": "",
|
||||
"xweb_xhr": "1",
|
||||
"x-zp-page-code": "0",
|
||||
"x-zp-version": version,
|
||||
"x-zp-platform": "12",
|
||||
"x-zp-device-id": device_id,
|
||||
"content-type": "application/json",
|
||||
"sec-fetch-site": "cross-site",
|
||||
"sec-fetch-mode": "cors",
|
||||
"sec-fetch-dest": "empty",
|
||||
"referer": "https://servicewechat.com/wxb7718fb9257e4fd2/619/page-frame.html",
|
||||
"accept-language": "zh-CN,zh;q=0.9",
|
||||
"priority": "u=1, i",
|
||||
}
|
||||
at_token = os.getenv("ZP_MINIAPP_AT", "").strip()
|
||||
rt_token = os.getenv("ZP_MINIAPP_RT", "").strip()
|
||||
params: Dict[str, Any] = {
|
||||
"channel": "wxxiaochengxu",
|
||||
"platform": "12",
|
||||
"version": version,
|
||||
"d": device_id,
|
||||
"S_SOU_COMPANY_ID": company_number,
|
||||
"S_SOU_POSITION_SOURCE_TYPE": 1,
|
||||
"eventScenario": "wxmpZhaopinSearchPositionsCompany",
|
||||
"pageCode": "wxmpZhaopinCompanyDetailPage",
|
||||
"pageIndex": page_index,
|
||||
"pageSize": page_size,
|
||||
"S_SOU_JD_JOB_LEVEL": "",
|
||||
}
|
||||
if at_token:
|
||||
params["at"] = at_token
|
||||
if rt_token:
|
||||
params["rt"] = rt_token
|
||||
if work_city is not None:
|
||||
params["S_SOU_WORK_CITY"] = work_city
|
||||
else:
|
||||
params["S_SOU_WORK_CITY"] = ""
|
||||
data = self._request_json("GET", url, headers, params=params)
|
||||
return data
|
||||
|
||||
async def async_search_jobs(
    self, city_id: int = 801, page_size: int = 15, page_index: int = 1,
    job_level3_code: Optional[str] = None,
) -> List:
    """Async bridge for ``search_jobs``.

    Executes ``self.search_jobs(city_id, page_size, page_index,
    job_level3_code)`` in a worker thread through ``asyncio.to_thread`` and
    returns the resulting value.
    """
    from asyncio import to_thread

    positional = (city_id, page_size, page_index, job_level3_code)
    jobs = await to_thread(self.search_jobs, *positional)
    return jobs
|
||||
|
||||
|
||||
@ -1,38 +0,0 @@
|
||||
# Base image: slim Python 3.9
FROM python:3.9-slim

# Working directory inside the image
WORKDIR /app

# Python / pip runtime settings
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Crawler configuration defaults (override with `docker run -e`)
ENV API_BASE_URL=http://124.222.245.240:9999 \
    MONGODB_URI=mongodb://localhost:27017 \
    MONGODB_DB=job_data \
    MAX_PAGES=3 \
    PAGE_SIZE=15 \
    MIN_WAIT_TIME=10 \
    MAX_WAIT_TIME=30 \
    ERROR_WAIT_MIN=30 \
    ERROR_WAIT_MAX=60

# Copy the requirements file and install the Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the project files
COPY boos_api.py city.json work.json ./

# Create a non-root user that owns /app and switch to it
RUN useradd -m -u 1000 crawler && chown -R crawler:crawler /app
USER crawler

# Start command
CMD ["python", "boos_api.py"]
|
||||
@ -1,30 +0,0 @@
|
||||
# Base image: slim Python 3.9
FROM python:3.9-slim

# Working directory inside the image
WORKDIR /app

# Python / pip runtime settings
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Configuration default (override with `docker run -e`)
ENV API_BASE_URL=http://124.222.106.226:9999

# Copy the requirements file and install the Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the project files
COPY boos_api.py city.json work.json ./

# Create a non-root user that owns /app and switch to it
RUN useradd -m -u 1000 crawler && chown -R crawler:crawler /app
USER crawler

# Start command
CMD ["python", "boos_api.py"]
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,282 +0,0 @@
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
import uuid
|
||||
import random
|
||||
from typing import Any, Dict, Optional
|
||||
import sqlite3
|
||||
|
||||
import requests
|
||||
try:
|
||||
import httpx
|
||||
except Exception:
|
||||
httpx = None
|
||||
|
||||
API_BASE_URL = os.getenv('API_BASE_URL', 'http://127.0.0.1:9999')
|
||||
|
||||
|
||||
def _gen_traceid() -> str:
    """Return a simple trace id: ``M-`` plus the first 12 hex digits of a UUID4."""
    return "M-" + uuid.uuid4().hex[:12]
|
||||
|
||||
|
||||
def report_universal(items: list, data_type: str = "job") -> bool:
    """Reporting is disabled: the arguments are ignored and ``False`` is always returned."""
    reported = False
    return reported
|
||||
|
||||
|
||||
|
||||
|
||||
def build_headers(
    user_agent: Optional[str] = None,
    referer: Optional[str] = None,
    cookies: Optional[str] = None,
    extra: Optional[Dict[str, str]] = None,
) -> Dict[str, str]:
    """Build the request headers, including the signature-related ones.

    Args:
        user_agent: Overrides the built-in WeChat mini-program User-Agent
            when truthy.
        referer: Overrides the built-in ``servicewechat.com`` referer when
            truthy.
        cookies: When truthy, sent verbatim as the ``Cookie`` header.
        extra: Additional headers; entries with a falsy value are skipped,
            the rest overwrite defaults of the same name.

    Returns:
        A new dict of header names to values. ``content-type`` is read from
        the ``BOSS_CT`` environment variable (form-urlencoded by default).
    """
    fallback_ua = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 "
        "MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF "
        "MacWechat/3.8.10(0x13080a10) XWEB/1227"
    )
    fallback_referer = "https://servicewechat.com/wxa8da525af05281f3/586/page-frame.html"

    result: Dict[str, str] = {
        "accept": "*/*",
        "accept-language": "zh-CN,zh;q=0.9",
        "accept-encoding": "gzip, deflate, br",
        "connection": "keep-alive",
        "sec-fetch-site": "cross-site",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "xweb_xhr": "1",
        # NOTE(review): wt2/mpt look like session credentials committed to
        # source; they act as defaults when `extra` does not override them.
        "wt2": "Epwo8bHXTy5wLU5ETExV2Ss5OwloFG3eJ0Pfe6T3FyIdDJhEyGkcxea9wI5VSqX-tafKQcVQJTI2szwdO0xQz3A~~",
        "mpt": "21728a788201acffa22d876d1fc0e8d7",
        "x-requested-with": "XMLHttpRequest",
        "User-Agent": user_agent if user_agent else fallback_ua,
        "referer": referer if referer else fallback_referer,
        "content-type": os.getenv("BOSS_CT", "application/x-www-form-urlencoded"),
    }

    # Only truthy extras are applied, so a missing env-derived value
    # (None / "") never clobbers a default.
    for header_name, header_value in (extra or {}).items():
        if header_value:
            result[header_name] = header_value

    if cookies:
        result["Cookie"] = cookies
    return result
|
||||
|
||||
|
||||
def call(
    query: str,
    city: str = "101020100",
    page: int = 1,
    page_size: int = 15,
    use_http2: bool = False,
    timeout: float = 10.0,
) -> Any:
    """Call the Boss mini-program job-search endpoint and return the result.

    Header and parameter values are taken from ``BOSS_*`` environment
    variables where set (``BOSS_APP_ID``, ``BOSS_WT2``, ``BOSS_MPT``,
    ``BOSS_COOKIES``, ``BOSS_ENCRYPT_EXPECT_ID``, ``BOSS_SKIP_VERIFY`` ...).

    Args:
        query: Search keyword.
        city: City code sent as the ``city`` parameter.
        page: Page number to request.
        page_size: Number of results per page.
        use_http2: Use ``httpx`` with HTTP/2 when ``httpx`` is importable;
            otherwise the request falls back to ``requests``.
        timeout: Request timeout in seconds.

    Returns:
        The decoded JSON body, or the raw response text when the body is not
        valid JSON.
    """

    def _decode(resp: Any) -> Any:
        # Both requests and httpx raise a ValueError subclass on invalid JSON.
        try:
            return resp.json()
        except ValueError:
            return resp.text

    url = "https://www.zhipin.com/wapi/zpgeek/miniapp/search/joblist.json"
    app_id = os.getenv("BOSS_APP_ID", "10002")
    extra_headers = {
        "mini_ver": os.getenv("BOSS_MINI_VER", "100.0000"),
        "ua": os.getenv("BOSS_UA_JSON", '{"model":"Mac16,8","platform":"mac"}'),
        "wt2": os.getenv("BOSS_WT2"),
        "zp_app_id": os.getenv("BOSS_ZP_APP_ID", app_id),
        "traceid": os.getenv("BOSS_TRACEID") or _gen_traceid(),
        "mpt": os.getenv("BOSS_MPT"),
        "scene": os.getenv("BOSS_SCENE_HEADER", "1089"),
        "zp_product_id": os.getenv("BOSS_ZP_PRODUCT_ID", app_id),
        "platform": os.getenv("BOSS_PLATFORM", "zhipin/mac"),
        "ver": os.getenv("BOSS_VER", "100.0000"),
    }
    headers = build_headers(
        user_agent=os.getenv("BOSS_USER_AGENT"),
        referer=os.getenv("BOSS_REFERER"),
        cookies=os.getenv("BOSS_COOKIES"),
        extra=extra_headers,
    )
    params = {
        "pageSize": str(page_size),
        "query": query,
        "city": city,
        "source": "1",
        "sortType": "0",
        "isSupplySearch": "true",
        "page": str(page),
        "appId": app_id,
    }
    enc_expect = os.getenv("BOSS_ENCRYPT_EXPECT_ID")
    if enc_expect:
        params["encryptExpectId"] = enc_expect

    # BOSS_SKIP_VERIFY=1 disables TLS certificate verification.
    skip_verify = os.getenv("BOSS_SKIP_VERIFY", "0") == "1"

    if use_http2 and httpx is not None:
        with httpx.Client(
            http2=True,
            headers=headers,
            timeout=timeout,
            verify=not skip_verify,
            trust_env=False,
        ) as client:
            return _decode(client.get(url, params=params))

    # Use the session as a context manager so its connection pool is released
    # even when the request raises (the session was previously never closed).
    with requests.Session() as session:
        session.trust_env = False
        resp = session.get(
            url,
            params=params,
            headers=headers,
            timeout=timeout,
            verify=not skip_verify,
        )
        return _decode(resp)
|
||||
|
||||
def _load_keywords(path: str) -> list:
    """Read keywords from a UTF-8 text file, one keyword per line.

    Args:
        path (str): File path.

    Returns:
        list: Stripped, non-empty lines in file order; an empty list when
        the file cannot be read for any reason.
    """
    try:
        with open(path, "r", encoding="utf-8") as handle:
            stripped = (raw.strip() for raw in handle)
            return [entry for entry in stripped if entry]
    except Exception:
        return []
|
||||
|
||||
|
||||
def _progress_iter(seq: list, desc: str = "", total: Optional[int] = None):
    """Yield the items of *seq* while drawing a one-line console progress bar.

    Args:
        seq (list): Items to iterate.
        desc (str): Label printed in front of the bar.
        total (Optional[int]): Count used for the percentage; ``len(seq)``
            is used when omitted.

    Yields:
        Any: Items from seq, in order.
    """
    width = 24
    expected = len(seq) if total is None else total
    for done, element in enumerate(seq, start=1):
        if expected:
            filled = int(width * done / expected)
            pct = int(100 * done / expected)
        else:
            # An unknown or zero total draws an empty bar at 100%.
            filled, pct = 0, 100
        bar = "#" * filled + "-" * (width - filled)
        # "\r" redraws the bar in place on the same console line.
        print(f"\r{desc} [{bar}] {done}/{expected} {pct}%", end="", flush=True)
        yield element
    # Finish the bar's line once the sequence is exhausted.
    print("", flush=True)
|
||||
|
||||
|
||||
def main(query: str) -> None:
|
||||
"""Entry point: fetch search result pages for *query* and store each raw response in SQLite.

Keywords that already have a row in the database are skipped (a JSON
``{"skip": true, ...}`` line is printed instead). City, page size, page
count and HTTP/2 usage are fixed values set below.
"""
|
||||
city ="101020100"
|
||||
page = 1
|
||||
||||
page_size = 15
|
||||
use_http2 = True
|
||||
# Number of consecutive pages fetched, starting at `page`.
pages = 3
|
||||
db_path = _get_db_path()
|
||||
_init_db(db_path)
|
||||
# Skip keywords that were already crawled on a previous run.
if _has_keyword(db_path, query):
|
||||
print(json.dumps({"skip": True, "keyword": query}, ensure_ascii=False))
|
||||
return
|
||||
for p in range(page, page + pages):
|
||||
# Short random pause before every request.
_sleep_between_requests(0.3, 0.8)
|
||||
result = call(query=query, city=city, page=p, page_size=page_size, use_http2=use_http2)
|
||||
# `call` returns a dict for JSON bodies and a str otherwise.
if isinstance(result, dict):
|
||||
raw = json.dumps({"page": p, "data": result}, ensure_ascii=False)
|
||||
print(raw)
|
||||
else:
|
||||
raw = str(result)
|
||||
print(raw)
|
||||
# Long random pause (10-20 s).
# NOTE(review): indentation is lost in this view, so it is unclear whether
# this pause runs for every page or only in the non-dict branch - confirm.
time.sleep(random.uniform(10, 20))
|
||||
try:
|
||||
_save_raw_response(db_path, query, p, raw)
|
||||
except Exception as e:
|
||||
print(f"Error saving raw response for {query} page {p}: {e}")
|
||||
|
||||
|
||||
|
||||
def _get_db_path() -> str:
    """Return the default SQLite database path: ``boss_raw.sqlite3`` beside this module."""
    module_dir, db_name = os.path.dirname(__file__), "boss_raw.sqlite3"
    return os.path.join(module_dir, db_name)
|
||||
|
||||
|
||||
def _init_db(db_path: str) -> None:
    """Initialise the SQLite database and create the ``responses`` table.

    The table is created only if it does not exist yet, so repeated calls
    are harmless. The operation is best-effort: any failure is swallowed and
    the function returns ``None`` either way.

    Args:
        db_path: Path of the SQLite database file.
    """
    try:
        con = sqlite3.connect(db_path)
        try:
            con.execute(
                """
                CREATE TABLE IF NOT EXISTS responses (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    keyword TEXT NOT NULL,
                    page INTEGER NOT NULL,
                    created_at INTEGER NOT NULL,
                    payload TEXT NOT NULL
                )
                """
            )
            con.commit()
        finally:
            # Always release the connection, even when the DDL fails
            # (previously it leaked on any error after connect()).
            con.close()
    except Exception:
        # Deliberate best-effort: initialisation errors are ignored.
        pass
|
||||
|
||||
|
||||
def _save_raw_response(db_path: str, keyword: str, page: int, raw_payload: str) -> None:
    """Store one raw response in the ``responses`` table.

    The row records the keyword, the page (coerced with ``int``), the
    current Unix time in seconds and the payload text. The operation is
    best-effort: any failure is swallowed, so callers never see an error.

    Args:
        db_path: Path of the SQLite database file.
        keyword: Search keyword the response belongs to.
        page: Page number of the response.
        raw_payload: Response body to store.
    """
    try:
        con = sqlite3.connect(db_path)
        try:
            con.execute(
                "INSERT INTO responses(keyword, page, created_at, payload) VALUES(?, ?, ?, ?)",
                (keyword, int(page), int(time.time()), raw_payload),
            )
            con.commit()
        finally:
            # Always release the connection, even when the insert fails
            # (previously it leaked on any error after connect()).
            con.close()
    except Exception:
        # Deliberate best-effort: storage errors are ignored.
        pass
|
||||
def _sleep_between_requests(min_seconds: float = 0.5, max_seconds: float = 1.5) -> None:
    """Sleep for a random duration between requests to lower anti-bot risk.

    Args:
        min_seconds (float): Minimum sleep time in seconds.
        max_seconds (float): Maximum sleep time in seconds.

    Returns:
        None
    """
    try:
        time.sleep(random.uniform(min_seconds, max_seconds))
    except Exception:
        # Fall back to the minimum delay if drawing or sleeping failed.
        time.sleep(min_seconds)
|
||||
|
||||
|
||||
def _has_keyword(db_path: str, keyword: str) -> bool:
    """Tell whether *keyword* already has at least one row in the database.

    Args:
        db_path (str): Path of the SQLite database file.
        keyword (str): Keyword to look up in the ``responses`` table.

    Returns:
        bool: ``True`` if a matching row exists; ``False`` otherwise,
        including when the lookup fails for any reason.
    """
    try:
        con = sqlite3.connect(db_path)
        try:
            row = con.execute(
                "SELECT 1 FROM responses WHERE keyword=? LIMIT 1", (keyword,)
            ).fetchone()
        finally:
            # Always release the connection, even when the query fails
            # (previously it leaked, e.g. when the table did not exist).
            con.close()
        return row is not None
    except Exception:
        # A failed lookup is treated as "not seen yet".
        return False
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Run the search for every keyword listed in company.txt, echoing each
    # keyword and drawing a progress bar over the whole list.
    all_keywords = _load_keywords("company.txt")
    for kw in _progress_iter(all_keywords, desc="Keywords", total=len(all_keywords)):
        print(kw)
        main(kw)
|
||||
@ -1,68 +0,0 @@
|
||||
import os
|
||||
import json
|
||||
|
||||
|
||||
def _load():
    """Load city and position lists from the JSON files beside this module.

    Reads ``city.json`` (entries under ``zpData.cityList``) and ``work.json``
    (third-level entries under ``zpData.config`` reached through two nested
    ``subLevelModelList`` levels).

    Returns:
        A ``(cities, positions)`` tuple; each element is a list of
        ``{"code": ..., "name": ...}`` dicts. A list is empty when the
        expected structure is missing from its file.
    """
    here = os.path.dirname(__file__)

    def _read_json(filename):
        with open(os.path.join(here, filename), "r", encoding="utf-8") as fh:
            return json.load(fh)

    city_data = _read_json("city.json")
    work_data = _read_json("work.json")

    cities = []
    if isinstance(city_data, dict) and city_data.get("zpData") and city_data["zpData"].get("cityList"):
        cities = [
            {"code": entry.get("code"), "name": entry.get("name")}
            for entry in city_data["zpData"]["cityList"]
        ]

    positions = []
    if isinstance(work_data, dict) and work_data.get("zpData") and work_data["zpData"].get("config"):
        # Category -> sub-category -> position; a missing or null
        # subLevelModelList counts as empty.
        positions = [
            {"code": leaf.get("code"), "name": leaf.get("name")}
            for category in work_data["zpData"]["config"]
            for group in (category.get("subLevelModelList") or [])
            for leaf in (group.get("subLevelModelList") or [])
        ]

    return cities, positions
|
||||
|
||||
|
||||
def _query_name(name: str) -> str:
    """Return the search query for a position name.

    Names in a fixed set of programming-language / platform names get the
    suffix "开发" appended; every other name is returned unchanged.
    """
    dev_names = {"Java", "Python", "PHP", "C#", "C/C++", "Golang", "Node.js", "Android", "iOS"}
    return f"{name}开发" if name in dev_names else name
|
||||
|
||||
|
||||
def enumerate_pairs():
    """Return every (city, position) combination as a list of dicts.

    Each dict carries ``city_code``, ``city_name``, ``position_code``,
    ``position_name`` and the derived search ``query``. The loaded city list
    is printed before the pairs are built.
    """
    cities, positions = _load()
    print(cities)
    return [
        {
            "city_code": city["code"],
            "city_name": city["name"],
            "position_code": position["code"],
            "position_name": position["name"],
            "query": _query_name(position["name"]),
        }
        for city in cities
        for position in positions
    ]
|
||||
|
||||
|
||||
def count_pairs():
    """Return the number of (city, position) combinations.

    The city entry named "全国" is excluded before counting; the remaining
    city list is printed.
    """
    all_cities, positions = _load()
    kept = [city for city in all_cities if (city.get("name") or "") != "全国"]
    print(kept)
    return len(kept) * len(positions)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # CLI: print how many cities, positions and city x position combinations
    # exist; --exclude-national drops the "全国" entry first.
    cli = argparse.ArgumentParser()
    cli.add_argument("--exclude-national", action="store_true")
    opts = cli.parse_args()

    cities, positions = _load()
    print(cities)
    if opts.exclude_national:
        cities = [c for c in cities if (c.get("name") or "") != "全国"]

    summary = {
        "cities": len(cities),
        "positions": len(positions),
        "combos": len(cities) * len(positions),
    }
    print(json.dumps(summary, ensure_ascii=False))
|
||||
@ -1,205 +0,0 @@
|
||||
2025-12-15 00:29:23.305 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
|
||||
2025-12-15 00:29:23.784 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
|
||||
2025-12-15 00:29:23.799 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
|
||||
2025-12-15 00:29:36.458 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=15588
|
||||
2025-12-15 00:29:52.503 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:29:52.511 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:29:58.560 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8693
|
||||
2025-12-15 00:30:08.730 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:30:08.740 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:30:26.814 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=19491
|
||||
2025-12-15 00:30:38.342 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:30:38.347 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:30:52.301 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=14544
|
||||
2025-12-15 00:31:02.709 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:31:02.717 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:31:14.741 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=9599
|
||||
2025-12-15 00:31:26.538 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:31:26.556 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:31:38.442 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6271
|
||||
2025-12-15 00:31:51.168 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:31:51.172 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:31:58.416 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6511
|
||||
2025-12-15 00:32:04.379 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:32:04.385 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:32:13.535 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=15049
|
||||
2025-12-15 00:32:27.255 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:32:27.260 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:32:42.121 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8828
|
||||
2025-12-15 00:32:55.898 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:32:55.910 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:33:07.429 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6049
|
||||
2025-12-15 00:33:22.839 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:33:22.844 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:33:31.224 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8423
|
||||
2025-12-15 00:33:40.356 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:33:40.361 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:33:44.962 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=9488
|
||||
2025-12-15 00:34:02.242 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:34:02.256 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:34:14.243 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=5747
|
||||
2025-12-15 00:34:21.270 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:34:21.274 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:34:41.171 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6132
|
||||
2025-12-15 00:35:00.832 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:35:00.846 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:35:12.452 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=13477
|
||||
2025-12-15 00:35:29.330 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:35:29.341 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:35:58.348 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=14304
|
||||
2025-12-15 00:36:11.586 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:36:11.595 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:36:16.537 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
|
||||
2025-12-15 00:36:31.183 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6403
|
||||
2025-12-15 00:36:42.608 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:36:42.613 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:36:55.674 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=13675
|
||||
2025-12-15 00:37:12.157 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:37:12.162 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:37:22.569 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=11202
|
||||
2025-12-15 00:37:37.726 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:37:37.733 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:37:54.948 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=9670
|
||||
2025-12-15 00:38:10.138 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:38:10.141 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:38:21.973 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=11575
|
||||
2025-12-15 00:38:25.384 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:38:25.390 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:38:35.538 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=12679
|
||||
2025-12-15 00:38:50.186 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:38:50.192 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:38:59.778 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6067
|
||||
2025-12-15 00:39:07.116 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:39:07.125 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:39:21.684 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=18332
|
||||
2025-12-15 00:39:32.316 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:39:32.323 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:39:42.899 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6725
|
||||
2025-12-15 00:39:58.505 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:39:58.510 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:40:13.074 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=9087
|
||||
2025-12-15 00:40:29.155 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:40:29.160 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:40:34.718 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7566
|
||||
2025-12-15 00:40:47.286 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:40:47.291 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:40:52.797 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=5322
|
||||
2025-12-15 00:41:05.125 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:41:05.134 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:41:16.720 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6080
|
||||
2025-12-15 00:41:32.930 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:41:32.937 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:41:48.101 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7216
|
||||
2025-12-15 00:41:57.043 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:41:57.053 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:42:14.886 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7975
|
||||
2025-12-15 00:42:25.873 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:42:25.885 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:42:49.378 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=5977
|
||||
2025-12-15 00:43:00.350 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:43:00.358 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:43:16.051 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=5361
|
||||
2025-12-15 00:43:30.122 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:43:30.130 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:43:41.456 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7612
|
||||
2025-12-15 00:43:56.218 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:43:56.224 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:44:09.073 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=5569
|
||||
2025-12-15 00:44:19.391 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:44:19.397 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:44:27.943 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=10078
|
||||
2025-12-15 00:44:33.276 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:44:33.284 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:44:43.176 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=5484
|
||||
2025-12-15 00:44:56.390 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:44:56.395 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:45:01.616 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8913
|
||||
2025-12-15 00:45:12.660 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:45:12.668 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:45:30.385 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6853
|
||||
2025-12-15 00:45:41.198 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:45:41.204 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:45:54.589 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6083
|
||||
2025-12-15 00:46:01.018 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:46:01.023 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:46:17.502 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=18314
|
||||
2025-12-15 00:46:29.858 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:46:29.863 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:46:39.637 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6078
|
||||
2025-12-15 00:46:47.242 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:46:47.251 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:47:00.182 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=13471
|
||||
2025-12-15 00:47:09.184 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:47:09.191 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:47:23.352 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=10896
|
||||
2025-12-15 00:47:33.324 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:47:33.328 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:47:44.855 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7888
|
||||
2025-12-15 00:47:58.799 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:47:58.804 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:48:06.025 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=13870
|
||||
2025-12-15 00:48:35.304 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6570
|
||||
2025-12-15 00:48:50.580 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:48:50.591 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:49:06.970 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7182
|
||||
2025-12-15 00:49:19.145 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:49:19.152 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:49:26.530 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=9248
|
||||
2025-12-15 00:49:36.738 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:49:36.743 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:49:49.563 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8773
|
||||
2025-12-15 00:49:57.434 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:49:57.439 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:50:05.881 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8679
|
||||
2025-12-15 00:50:11.685 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:50:11.689 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:50:19.198 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8517
|
||||
2025-12-15 00:50:27.954 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:50:27.959 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:50:43.572 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8826
|
||||
2025-12-15 00:50:52.327 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:50:52.335 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:51:04.560 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8012
|
||||
2025-12-15 00:51:17.087 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:51:17.091 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:51:28.352 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=12685
|
||||
2025-12-15 00:51:34.970 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:51:34.975 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:51:52.376 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7079
|
||||
2025-12-15 00:52:01.848 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:52:01.855 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:52:19.135 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6340
|
||||
2025-12-15 00:52:30.032 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:52:30.042 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:52:34.956 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=17521
|
||||
2025-12-15 00:52:46.156 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:52:46.161 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:53:00.735 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6173
|
||||
2025-12-15 00:53:12.913 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:53:12.918 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:53:21.232 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7812
|
||||
2025-12-15 00:53:24.212 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:53:24.218 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:53:33.263 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=16922
|
||||
2025-12-15 00:53:44.780 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:53:44.787 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:53:57.646 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
|
||||
2025-12-15 00:54:15.890 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=5852
|
||||
2025-12-15 00:54:27.390 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:54:27.397 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:54:39.491 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8181
|
||||
2025-12-15 00:54:52.853 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:54:52.859 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:55:04.783 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=8271
|
||||
2025-12-15 00:55:20.285 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:55:20.291 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:55:27.996 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=6952
|
||||
2025-12-15 00:55:32.327 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:55:32.333 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:55:41.168 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=13433
|
||||
2025-12-15 00:55:55.488 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:55:55.497 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:56:12.980 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=7118
|
||||
2025-12-15 00:56:31.244 | INFO | __main__:push_company:1826 - REPORT_DATA_STATUS {'status': 200, 'size': 116}
|
||||
2025-12-15 00:56:31.252 | INFO | __main__:push_job:1870 - REPORT_DATA_STATUS {'status': 200, 'size': 112}
|
||||
2025-12-15 00:56:38.954 | INFO | __main__:boss_batch_request:1386 - RAW_RESPONSE method=POST url=https://www.zhipin.com/wapi/batch/requests status=200 resp_size=12272
|
||||
@ -1,33 +0,0 @@
|
||||
2025-12-22 00:17:18.388 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:17:18.825 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:17:18.916 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:17:30.421 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:17:40.552 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:17:50.896 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:18:05.607 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:18:10.877 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:18:22.338 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:18:30.282 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:18:44.142 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:19:01.977 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:19:09.058 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:19:16.908 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:19:32.476 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:19:43.911 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:19:51.648 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:20:08.812 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:20:19.945 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:20:27.279 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:20:34.469 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:20:44.545 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=2106a6e5a7d163be4c13..., wt2= ...
|
||||
2025-12-22 00:20:55.698 | ERROR | __main__:load_token_from_api:347 - ❌ API请求失败: 502
|
||||
2025-12-22 00:20:55.698 | WARNING | __main__:load_token_from_api:355 - ⚠️ 使用默认token值
|
||||
2025-12-22 00:21:06.888 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
|
||||
2025-12-22 00:21:07.390 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
|
||||
2025-12-22 00:21:07.482 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
|
||||
2025-12-22 00:21:14.593 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
|
||||
2025-12-22 00:21:25.726 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
|
||||
2025-12-22 00:21:38.392 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
|
||||
2025-12-22 00:21:51.367 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
|
||||
2025-12-22 00:22:01.894 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=fad715287d5bcad074ca..., wt2= ...
|
||||
2025-12-22 00:22:11.462 | INFO | __main__:load_token_from_api:328 - ✅ 成功从API获取token: id=3, mpt=ff0679d952e35826cced..., wt2= ...
|
||||
@ -1,5 +0,0 @@
|
||||
requests==2.32.4
|
||||
loguru==0.7.3
|
||||
httpx==0.28.1
|
||||
fake-useragent==2.2.0
|
||||
PySocks==1.7.1
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
64312
jobs_spider/qcwy.txt
64312
jobs_spider/qcwy.txt
File diff suppressed because it is too large
Load Diff
@ -1,38 +0,0 @@
|
||||
# 使用Python 3.9作为基础镜像
|
||||
FROM python:3.9-slim
|
||||
|
||||
# 设置工作目录
|
||||
WORKDIR /app
|
||||
|
||||
# 设置环境变量
|
||||
ENV PYTHONUNBUFFERED=1 \
|
||||
PYTHONDONTWRITEBYTECODE=1 \
|
||||
PIP_NO_CACHE_DIR=1 \
|
||||
PIP_DISABLE_PIP_VERSION_CHECK=1
|
||||
|
||||
# 配置环境变量(可通过docker run -e 覆盖)
|
||||
ENV API_BASE_URL=http://124.222.245.240:9999 \
|
||||
MONGODB_URI=mongodb://localhost:27017 \
|
||||
MONGODB_DB=job_data \
|
||||
MAX_PAGES=3 \
|
||||
PAGE_SIZE=15 \
|
||||
MIN_WAIT_TIME=10 \
|
||||
MAX_WAIT_TIME=30 \
|
||||
ERROR_WAIT_MIN=30 \
|
||||
ERROR_WAIT_MAX=60
|
||||
|
||||
# 复制requirements文件并安装Python依赖
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# 复制项目文件
|
||||
COPY boos_api.py ./
|
||||
COPY city.json ./
|
||||
COPY work.json ./
|
||||
|
||||
# 创建非root用户
|
||||
RUN useradd -m -u 1000 crawler && chown -R crawler:crawler /app
|
||||
USER crawler
|
||||
|
||||
# 启动命令
|
||||
CMD ["python", "boos_api.py"]
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,492 +0,0 @@
|
||||
{
|
||||
"abroadFlag": 2,
|
||||
"abroadTipInfo": {
|
||||
"abroadTips": [],
|
||||
"icon": "",
|
||||
"title": ""
|
||||
},
|
||||
"adResponse": None,
|
||||
"aiPositionRecommendLevel": "",
|
||||
"aiPositionRecommendReason": "",
|
||||
"alreadyCallPhone": False,
|
||||
"applyType": "1",
|
||||
"campusBestCompany": {
|
||||
"bestCompanyUrl": "",
|
||||
"homepageType": 0,
|
||||
"logoTagUrl": "",
|
||||
"state": 0
|
||||
},
|
||||
"campusJobDetail": {
|
||||
"applyEndTime": 1767196799999,
|
||||
"applyStartTime": 1760457600000,
|
||||
"applyTimeCountDown": "",
|
||||
"cityName": "内江",
|
||||
"companyLogo": "https://storage-public.zhaopin.cn/campus/newrd/online/1760485043129445931/%25E6%2588%2590%25E6%25B8%259D%25E9%2592%2592%25E9%2592%259B%25E7%25A7%2591%25E6%258A%2580%25E6%259C%2589%25E9%2599%2590%25E5%2585%25AC%25E5%258F%25B8.png",
|
||||
"companyName": "成渝钒钛科技有限公司",
|
||||
"companyNumber": "KA0224051216D90000037000",
|
||||
"countDownBackColor": "",
|
||||
"countDownFontColor": "",
|
||||
"industryName": "钢铁/有色金属冶炼及加工",
|
||||
"orgSizeName": "1000-9999人",
|
||||
"orgTypeName": "民营"
|
||||
},
|
||||
"campusJobMatchData": None,
|
||||
"campusPositionCardTagInfo": None,
|
||||
"campusPreciseMatchType": 0,
|
||||
"campusRootOrgInfo": {
|
||||
"address": "",
|
||||
"businessLicenceName": "中国共产主义青年团四川省委员会",
|
||||
"cityName": "成都",
|
||||
"description": "",
|
||||
"displayOrgName": "中国共产主义青年团四川省委员会",
|
||||
"hideCampusElement": False,
|
||||
"hideJobApplyLable": False,
|
||||
"id": 104514,
|
||||
"industryName": "社团/组织/社会保障",
|
||||
"introUrl": "",
|
||||
"logo": "https://storage-public.zhaopin.cn/campus/newrd/online/1760484389628245179/5baa527cc949cacd676ba77cce6cb79d.png",
|
||||
"oldRdOrgNumber": "",
|
||||
"orgName": "中国共产主义青年团四川省委员会",
|
||||
"orgNumber": "KA0224051216P90000006000",
|
||||
"orgSizeName": "",
|
||||
"orgTypeName": "国家机关",
|
||||
"slaveDisplayOrgName": ""
|
||||
},
|
||||
"canBeRegular": False,
|
||||
"canRemoteInternship": False,
|
||||
"cardCustomJson": "{"address":"内江 威远县","companyName":"成渝钒钛","locationType":"1","salary60":"5000-10000元"}",
|
||||
"cardType": 1,
|
||||
"chatWindow": 1,
|
||||
"cityDistrict": "威远县",
|
||||
"cityId": "809",
|
||||
"commercialLabel": [
|
||||
{
|
||||
"labelDescription": None,
|
||||
"type": 2,
|
||||
"typeName": "网申",
|
||||
"typeShowLabel": "https://img09.zhaopin.cn/2012/other/mobile/capp/position/ui23/tag_jd_xiao_3x.png?w=81&h=48&r=3"
|
||||
},
|
||||
{
|
||||
"labelDescription": None,
|
||||
"type": 27,
|
||||
"typeName": "直招",
|
||||
"typeShowLabel": "https://img09.zhaopin.com/2012/other/mobile/capp/position/home/tag_jd_zhizhao.png?w=78&h=48&r=3"
|
||||
}
|
||||
],
|
||||
"commonTrack": {
|
||||
"trackCommercialFeature": "",
|
||||
"trackSocialSearchEmergencyFeature": False
|
||||
},
|
||||
"companyId": 10097072,
|
||||
"companyLogo": "https://storage-public.zhaopin.cn/campus/newrd/online/1760485043129445931/%25E6%2588%2590%25E6%25B8%259D%25E9%2592%2592%25E9%2592%259B%25E7%25A7%2591%25E6%258A%2580%25E6%259C%2589%25E9%2599%2590%25E5%2585%25AC%25E5%258F%25B8.png",
|
||||
"companyName": "成渝钒钛科技有限公司",
|
||||
"companyNumber": "KA0224051216D90000037000",
|
||||
"companyRootId": 104514,
|
||||
"companyScaleTypeTagsNew": [],
|
||||
"companySize": "",
|
||||
"companyUrl": "https://xiaoyuan.zhaopin.com/company/KA0224051216P90000006000",
|
||||
"complainFlag": False,
|
||||
"deliveryPath": "",
|
||||
"displayPhoneNumber": False,
|
||||
"distance": 0.0,
|
||||
"distanceFormat": "",
|
||||
"distanceText": "",
|
||||
"education": "本科",
|
||||
"experimentInfo": None,
|
||||
"extensions": None,
|
||||
"featureServer": {
|
||||
"jdViews3d": "33",
|
||||
"lastReplyTimeText": "",
|
||||
"staffReplyRate30d": 0.0,
|
||||
"todayReplyNum": 0,
|
||||
"todayReplyNumText": ""
|
||||
},
|
||||
"feedOperation": None,
|
||||
"feedPosition": None,
|
||||
"financingStage": {
|
||||
"name": ""
|
||||
},
|
||||
"firstPublishTime": "2025-10-15 07:52:28",
|
||||
"hasAppliedPosition": False,
|
||||
"industryCompanyTags": [
|
||||
"1400020000"
|
||||
],
|
||||
"industryName": "钢铁/有色金属冶炼及加工",
|
||||
"innerBusinessInfo": {
|
||||
"customIndustryList": []
|
||||
},
|
||||
"internshipMonths": 0,
|
||||
"isNewPosition": 0,
|
||||
"jdCardType": 2,
|
||||
"jobDetailData": {
|
||||
"company": {
|
||||
"base": None,
|
||||
"companyComment": None,
|
||||
"companyInterview": None,
|
||||
"jumpDetail": None,
|
||||
"orgBestRanking": None,
|
||||
"orgReliableCompany": None,
|
||||
"other": None,
|
||||
"state": None
|
||||
},
|
||||
"companyProxy": {
|
||||
"companyAddress": "",
|
||||
"companyImage": "",
|
||||
"companyName": "",
|
||||
"companySize": "",
|
||||
"entryCompanyTitle": ""
|
||||
},
|
||||
"customAttributeInfo": {
|
||||
"platformRemind": "",
|
||||
"reportItems": [],
|
||||
"welfareItems": [],
|
||||
"workTimeItems": []
|
||||
},
|
||||
"debug": {},
|
||||
"experimentInfo": {
|
||||
"blueCollarJobTitleExperimentInfo": None
|
||||
},
|
||||
"featureServer": None,
|
||||
"imSection": None,
|
||||
"internship": [],
|
||||
"live": {
|
||||
"liveItems": [],
|
||||
"liveQuickFocusChecked": 0,
|
||||
"liveQuickFocusState": 0,
|
||||
"recommendLiveList": [],
|
||||
"state": 0
|
||||
},
|
||||
"operationSection": {
|
||||
"topJobBannerArea": None
|
||||
},
|
||||
"partTime": [],
|
||||
"position": {
|
||||
"base": {
|
||||
"deliveryPath": "",
|
||||
"education": "本科",
|
||||
"educationCode": "",
|
||||
"maxSalary": "",
|
||||
"minSalary": "",
|
||||
"positionId": 40846760303,
|
||||
"positionName": "技术操作工",
|
||||
"positionNumber": "CC224051210J40846760303",
|
||||
"positionUrl": "",
|
||||
"positionWorkingExp": "无经验",
|
||||
"positionWorkingExpCode": "",
|
||||
"propertyType": "",
|
||||
"salary": "5000-10000元",
|
||||
"salaryReal": "",
|
||||
"workType": "全职"
|
||||
},
|
||||
"date": {
|
||||
"dateEnd": "",
|
||||
"dateStart": "",
|
||||
"firstPublishTime": "",
|
||||
"positionPublishTime": "",
|
||||
"positionUpdateTime": "",
|
||||
"positionUpdateTimeText": ""
|
||||
},
|
||||
"desc": {
|
||||
"description": "",
|
||||
"descriptionHighlight": "",
|
||||
"highlightLabels": [],
|
||||
"labels": [],
|
||||
"performanceBonus": "",
|
||||
"welfareLabel": [],
|
||||
"welfareTags": []
|
||||
},
|
||||
"jobType": {
|
||||
"jobType": "",
|
||||
"jobTypeLevel": "15000100000000",
|
||||
"jobTypeLevelName": "",
|
||||
"subJobType": "",
|
||||
"subJobTypeLevel": "15000100190000",
|
||||
"subJobTypeLevelName": ""
|
||||
},
|
||||
"onlineCarHailingExtend": {
|
||||
"gray": False,
|
||||
"promiseGuarantee": ""
|
||||
},
|
||||
"onlineCarInfo": [],
|
||||
"other": {
|
||||
"customJobGroup": "NORMAL_DIRECT",
|
||||
"deliveredPreviouslyTip": "",
|
||||
"jobKeyword": {
|
||||
"keywords": []
|
||||
},
|
||||
"jobSkillTags": [],
|
||||
"jobTypeIsBlueCollar": True,
|
||||
"pageStyle": 0,
|
||||
"positionCommercialLabel": [
|
||||
{
|
||||
"labelDescription": None,
|
||||
"type": 2,
|
||||
"typeName": "网申",
|
||||
"typeShowLabel": "https://img09.zhaopin.cn/2012/other/mobile/capp/position/ui23/tag_jd_xiao_3x.png?w=81&h=48&r=3"
|
||||
},
|
||||
{
|
||||
"labelDescription": None,
|
||||
"type": 27,
|
||||
"typeName": "直招",
|
||||
"typeShowLabel": "https://img09.zhaopin.com/2012/other/mobile/capp/position/home/tag_jd_zhizhao.png?w=78&h=48&r=3"
|
||||
}
|
||||
],
|
||||
"positionHighlight": "",
|
||||
"propertyTypeUrl": "",
|
||||
"rpoProxyDisplayOrgName": "",
|
||||
"urgentRecruitmentUrl": ""
|
||||
},
|
||||
"preferredHrInfo": {
|
||||
"icon": "",
|
||||
"introduce": "",
|
||||
"jumpUrl": "",
|
||||
"preferredHr": False
|
||||
},
|
||||
"todayInterview": None,
|
||||
"workLocation": {
|
||||
"address": "工作地点:内江 · 威远县",
|
||||
"addressType": 0,
|
||||
"latitude": "0",
|
||||
"longitude": "0",
|
||||
"positionCityDistrict": "",
|
||||
"positionCityDistrictCode": "",
|
||||
"positionCityId": "809",
|
||||
"positionWorkCity": "",
|
||||
"showMap": True,
|
||||
"showMultiAddressesTip": "",
|
||||
"staticMapUrl": "https://img09.zhaopin.cn/2012/other/mobile/capp/position/detail/defaultMapUrl.png?w=1230&h=720&r=3",
|
||||
"streetName": "",
|
||||
"tradingArea": "",
|
||||
"travelMode": "bus",
|
||||
"verifyTheTruthUrl": "",
|
||||
"workAddress": "内江威远县连界镇连界工业园区"
|
||||
}
|
||||
},
|
||||
"proxyWarning": None,
|
||||
"recommender": {
|
||||
"avatar": "",
|
||||
"name": "",
|
||||
"state": 0,
|
||||
"text": "",
|
||||
"title": ""
|
||||
},
|
||||
"secure": {
|
||||
"abroadLabel": "",
|
||||
"abroadTipInfo": None,
|
||||
"safeCenter": None,
|
||||
"safetyReminder": None
|
||||
},
|
||||
"staff": {
|
||||
"activityLevel": [],
|
||||
"auditNaturePrompt": None,
|
||||
"authenticationState": 0,
|
||||
"avatar": "http://img09.zhaopin.cn/2012/other/mobile/position/list/hr_88.png",
|
||||
"companyName": "成渝钒钛科技有限公司",
|
||||
"goldMedalInterviewer": None,
|
||||
"greeting": "",
|
||||
"greetingHasDelivery": "",
|
||||
"hrJob": "人事经理",
|
||||
"hrOnlineIocState": 0,
|
||||
"hrOnlineState": "",
|
||||
"hrResumeOperationState": "",
|
||||
"hrStateInfo": "",
|
||||
"id": 1212725485,
|
||||
"lastOnlineTime": 0,
|
||||
"lastOnlineTimeText": "",
|
||||
"modularState": 1,
|
||||
"other": {
|
||||
"freeTag": None,
|
||||
"tagUrl": ""
|
||||
},
|
||||
"positionDetailStaffQuickReply": None,
|
||||
"staffName": "HR"
|
||||
},
|
||||
"stateInfo": {
|
||||
"deliveryAfterGuide": None,
|
||||
"imSessionInfoDetail": {
|
||||
"imChatStatus": 0,
|
||||
"imChatStatusForChatBeforeDelivery": 0,
|
||||
"imDeliveryTitle": "",
|
||||
"referType": -1
|
||||
},
|
||||
"positionBehaviorState": {
|
||||
"deliveryState": 0,
|
||||
"favoriteState": 0,
|
||||
"followHrState": 0,
|
||||
"imReplyState": -1,
|
||||
"negativeState": 0,
|
||||
"sessionChatState": 0
|
||||
},
|
||||
"state": {
|
||||
"abroadFlag": 2,
|
||||
"applyType": "",
|
||||
"callProcess": "",
|
||||
"hasAppliedPosition": False,
|
||||
"positionDeliveryType": "",
|
||||
"positionSourceType": 2,
|
||||
"workMode": "ONSITE",
|
||||
"workModeDesc": ""
|
||||
},
|
||||
"useNewAfterDeliveryStyle": False
|
||||
},
|
||||
"verifyTheTruth": None,
|
||||
"verifyTrueFeedback": None
|
||||
},
|
||||
"jobDetailShowUrgentTag": False,
|
||||
"jobHitReason": "",
|
||||
"jobHitReasonHighlights": [],
|
||||
"jobId": 40846760303,
|
||||
"jobKeyword": {
|
||||
"keywords": [
|
||||
{
|
||||
"itemValue": ""
|
||||
},
|
||||
{
|
||||
"itemValue": ""
|
||||
},
|
||||
{
|
||||
"itemValue": ""
|
||||
}
|
||||
]
|
||||
},
|
||||
"jobKnowledgeWelfareFeatures": [],
|
||||
"jobPostingTime": 1760485948498,
|
||||
"jobRootOrgInfo": {
|
||||
"cityName": "内江"
|
||||
},
|
||||
"jobSkillTags": [],
|
||||
"jobSummary": "负责新产品开发、新技术运用,并对实施过程的质量、技术协调、管理",
|
||||
"liveCard": {
|
||||
"icon": "",
|
||||
"liveState": 0,
|
||||
"liveTips": "",
|
||||
"roomId": 0,
|
||||
"startTimeFormat": "",
|
||||
"videoUrl": ""
|
||||
},
|
||||
"matchInfo": {
|
||||
"icon": "http://img09.zhaopin.cn/2012/other/mobile/position/list/icon_jd_recommend.png?w=156&h=54&r=3",
|
||||
"matched": 1,
|
||||
"tagState": 0
|
||||
},
|
||||
"menVipLevel": 0,
|
||||
"name": "技术操作工",
|
||||
"needMajor": [],
|
||||
"number": "CC224051210J40846760303",
|
||||
"operationImageLabel": [
|
||||
"https://img09.zhaopin.com/2012/other/mobile/app/im/job_protection.png?w=174&h=69&r=3"
|
||||
],
|
||||
"orgBestEmployerFlag": 0,
|
||||
"orgCommercialTags": [],
|
||||
"orgPayedFlag": 0,
|
||||
"organizer": True,
|
||||
"payload": {
|
||||
"name": "",
|
||||
"partition": "",
|
||||
"score": "",
|
||||
"weight": ""
|
||||
},
|
||||
"positionCommercialLabel": [
|
||||
{
|
||||
"labelDescription": None,
|
||||
"type": 2,
|
||||
"typeName": "网申",
|
||||
"typeShowLabel": "https://img09.zhaopin.cn/2012/other/mobile/capp/position/ui23/tag_jd_xiao_3x.png?w=81&h=48&r=3"
|
||||
},
|
||||
{
|
||||
"labelDescription": None,
|
||||
"type": 27,
|
||||
"typeName": "直招",
|
||||
"typeShowLabel": "https://img09.zhaopin.com/2012/other/mobile/capp/position/home/tag_jd_zhizhao.png?w=78&h=48&r=3"
|
||||
}
|
||||
],
|
||||
"positionExpandCardData": "",
|
||||
"positionExpandCardType": 0,
|
||||
"positionHighlight": "",
|
||||
"positionOfNlp": 1,
|
||||
"positionSourceType": 2,
|
||||
"positionSourceTypeUrl": "https://img09.zhaopin.cn/2012/other/mobile/capp/position/ui23/tag_jd_xiao_3x.png?w=81&h=48&r=3",
|
||||
"positionURL": "https://xiaoyuan.zhaopin.com/job/CC224051210J40846760303",
|
||||
"positionUrl": "https://xiaoyuan.zhaopin.com/job/CC224051210J40846760303",
|
||||
"property": "民营",
|
||||
"propertyCode": "5",
|
||||
"propertyName": "国家机关",
|
||||
"propertyType": "",
|
||||
"propertyTypeUrl": "",
|
||||
"provideInternshipCertificate": False,
|
||||
"proxyModel": {
|
||||
"proxiedOrgName": "",
|
||||
"proxiedOrgSize": "",
|
||||
"recruitPosition": 0
|
||||
},
|
||||
"publishTime": "2025-10-15 07:52:28",
|
||||
"recallSign": {
|
||||
"gMethod": "config-position_search-position_campus_query-COMPANY-fix_20250109",
|
||||
"gParam": "query-ps-campus-query-1",
|
||||
"gQuery": "query-ps-campus-query-1",
|
||||
"gSort": "query-ps-campus-query-1",
|
||||
"gSource": "solr.source_position_query",
|
||||
"gWeight": 20
|
||||
},
|
||||
"recruitNumber": 0,
|
||||
"redirectUrl": "",
|
||||
"redirectable": False,
|
||||
"rootCompanyNumber": "KA0224051216P90000006000",
|
||||
"rpoProxied": False,
|
||||
"rpoProxy": False,
|
||||
"salary60": "5000-10000元",
|
||||
"salaryCount": "",
|
||||
"salaryReal": "5000-10000",
|
||||
"salaryType": 1,
|
||||
"searchTagList": [],
|
||||
"securityAddressLabel": "",
|
||||
"settlementType": "",
|
||||
"showDistance": 0,
|
||||
"showSkillTags": [
|
||||
{
|
||||
"highlightBackGroundColor": "",
|
||||
"highlightWordColor": "",
|
||||
"tag": "本科"
|
||||
}
|
||||
],
|
||||
"skillLabel": [],
|
||||
"skillLabelPersonality": "",
|
||||
"staffCard": {
|
||||
"authenticationState": 0,
|
||||
"avatar": "http://img09.zhaopin.cn/2012/other/mobile/position/list/hr_88.png",
|
||||
"goldMedalInterviewer": {
|
||||
"goldMedalInterviewer": False,
|
||||
"interviewerImageUrl": "",
|
||||
"interviewerTitle": ""
|
||||
},
|
||||
"hrCompanyName": "",
|
||||
"hrJob": "人事经理",
|
||||
"hrOnlineIocState": 0,
|
||||
"hrOnlineState": "",
|
||||
"hrStateInfo": "",
|
||||
"id": 1212725485,
|
||||
"lastOnlineTime": 1762492035114,
|
||||
"lastOnlineTimeText": "",
|
||||
"staffName": "HR"
|
||||
},
|
||||
"streetId": 0,
|
||||
"streetName": "",
|
||||
"subJobTypeLevel": "15000100190000",
|
||||
"subJobTypeLevelName": "技工",
|
||||
"subways": [],
|
||||
"tagABC": "",
|
||||
"tagList": [],
|
||||
"todayInterview": False,
|
||||
"todayInterviewImageUrl": "",
|
||||
"topLabel": None,
|
||||
"tradingArea": "",
|
||||
"volcanoMeterial": None,
|
||||
"weeklyInternshipDays": 0,
|
||||
"welfareLabel": [],
|
||||
"welfareTagList": [],
|
||||
"workCity": "内江",
|
||||
"workDateType": "",
|
||||
"workMode": "",
|
||||
"workType": "全职",
|
||||
"workingExp": "无经验"
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
@ -1,6 +0,0 @@
|
||||
{"timestamp": "2026-01-15 00:38:02", "total_crawled": 517, "unique_count": 503, "duplicate_count": 14, "api_total_count": 505, "job_area": "商丘", "function_type": "8305"}
|
||||
{"timestamp": "2026-01-15 01:36:23", "total_crawled": 517, "unique_count": 509, "duplicate_count": 8, "api_total_count": 517, "job_area": "广安", "function_type": "1318"}
|
||||
{"timestamp": "2026-01-15 02:32:36", "total_crawled": 517, "unique_count": 511, "duplicate_count": 6, "api_total_count": 517, "job_area": "阜阳", "function_type": "6101"}
|
||||
{"timestamp": "2026-01-15 03:32:52", "total_crawled": 517, "unique_count": 513, "duplicate_count": 4, "api_total_count": 517, "job_area": "常德", "function_type": "3812"}
|
||||
{"timestamp": "2026-01-15 04:31:42", "total_crawled": 517, "unique_count": 510, "duplicate_count": 7, "api_total_count": 517, "job_area": "惠州", "function_type": "3335"}
|
||||
{"timestamp": "2026-01-15 05:28:54", "total_crawled": 517, "unique_count": 515, "duplicate_count": 2, "api_total_count": 517, "job_area": "锦州", "function_type": "0154"}
|
||||
@ -1,21 +0,0 @@
|
||||
2025-12-15 00:55:26.469 | INFO | __main__:crawl_recommend_jobs_main:911 - 已配置代理: http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818
|
||||
2025-12-15 00:55:26.502 | INFO | __main__:_init_client:323 - 初始化客户端,使用代理: http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818
|
||||
2025-12-15 00:55:26.601 | INFO | __main__:crawl_recommend_jobs_main:943 - 随机选择工作类型: 货运司机 (1831)
|
||||
2025-12-15 00:55:26.601 | INFO | __main__:crawl_recommend_jobs_main:945 - 开始爬取推荐职位,最多 3 页
|
||||
2025-12-15 00:55:26.601 | INFO | __main__:crawl_multiple_pages:803 - 正在爬取第 1 页...
|
||||
2025-12-15 00:55:26.601 | INFO | __main__:_init_client:323 - 初始化客户端,使用代理: http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818
|
||||
2025-12-15 00:55:26.607 | INFO | __main__:get_recommend_jobs:512 - 获取推荐职位: 页码=1, 页大小=10, 地区=190300, 工作类型=1831
|
||||
2025-12-15 00:55:26.694 | WARNING | __main__:_make_request:477 - 请求失败: 500 - {"status":"100000","message":"网络超时,请稍后重试!"}
|
||||
2025-12-15 00:55:27.728 | WARNING | __main__:_make_request:477 - 请求失败: 500 - {"status":"100000","message":"网络超时,请稍后重试!"}
|
||||
2025-12-15 00:55:29.766 | WARNING | __main__:_make_request:477 - 请求失败: 500 - {"status":"100000","message":"网络超时,请稍后重试!"}
|
||||
2025-12-15 00:55:29.767 | ERROR | __main__:_make_request:503 - 所有重试失败,请求终止
|
||||
2025-12-15 00:55:30.063 | INFO | __main__:_make_request:474 - 请求参数 method=POST url=https://cupid.51job.com/open/noauth/recommend/job-tab-dynamic-wx-mini?api_key=51job&timestamp=1765731326 status=200 resp_size=52176
|
||||
2025-12-15 00:55:30.064 | INFO | __main__:get_recommend_jobs:585 - 成功获取 10 个职位
|
||||
2025-12-15 00:55:30.131 | INFO | __main__:_make_request:474 - 请求参数 method=GET url=https://cupid.51job.com/open/noauth/jobs/detail/base/169639008?api_key=51job&timestamp=1765731330 status=200 resp_size=5614
|
||||
2025-12-15 00:55:30.202 | INFO | __main__:_make_request:474 - 请求参数 method=GET url=https://cupid.51job.com/open/noauth/company-info/info-data?api_key=51job&timestamp=1765731330&companyId=10080947&colorOne=%23ffffff&colorTwo=%23ffffffcc status=200 resp_size=4186
|
||||
2025-12-15 00:55:31.599 | INFO | __main__:_make_request:474 - 请求参数 method=GET url=https://cupid.51job.com/open/noauth/jobs/detail/base/169747067?api_key=51job&timestamp=1765731331 status=200 resp_size=5105
|
||||
2025-12-15 00:55:31.660 | INFO | __main__:_make_request:474 - 请求参数 method=GET url=https://cupid.51job.com/open/noauth/company-info/info-data?api_key=51job&timestamp=1765731331&companyId=9427865&colorOne=%23ffffff&colorTwo=%23ffffffcc status=200 resp_size=2743
|
||||
2025-12-15 00:55:34.376 | INFO | __main__:_make_request:474 - 请求参数 method=GET url=https://cupid.51job.com/open/noauth/jobs/detail/base/162834418?api_key=51job&timestamp=1765731334 status=200 resp_size=5231
|
||||
2025-12-15 00:55:34.451 | INFO | __main__:_make_request:474 - 请求参数 method=GET url=https://cupid.51job.com/open/noauth/company-info/info-data?api_key=51job&timestamp=1765731334&companyId=9371694&colorOne=%23ffffff&colorTwo=%23ffffffcc status=200 resp_size=2524
|
||||
2025-12-15 00:55:35.084 | INFO | __main__:crawl_recommend_jobs_main:964 - 用户中断推荐职位爬取
|
||||
2025-12-15 00:55:35.085 | INFO | __main__:close:878 - httpx客户端已关闭
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,21 +0,0 @@
|
||||
{
|
||||
'status': '1',
|
||||
'message': '成功',
|
||||
'resultbody': {
|
||||
'jobList': {
|
||||
'items': [],
|
||||
'totalCount': 378,
|
||||
'listShowTypeFuncTypeCode': '',
|
||||
'quickDeliveryFuncTypeCode': '',
|
||||
'salaryExpandEnabled': True,
|
||||
'needTransition': False,
|
||||
'totalcount': 378
|
||||
},
|
||||
'adsTabFeeds': [
|
||||
],
|
||||
'requestId': '9e791cba2cce6ac408fc29282bfe1927_7ebb7d6e43dbebc23473123df02328d4',
|
||||
'beyondDistancePosition': -1,
|
||||
'jobLibCodes': [],
|
||||
'jobAreaList': []
|
||||
}
|
||||
}
|
||||
@ -1,231 +0,0 @@
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import hmac
|
||||
import hashlib
|
||||
import random
|
||||
from typing import Any, Dict, List, Optional
|
||||
from urllib.parse import quote
|
||||
|
||||
import requests
|
||||
import urllib.parse
|
||||
import urllib3
|
||||
|
||||
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
|
||||
# Web search endpoint queried by this script.
BASE_URL = "https://we.51job.com/api/job/search-pc"

# Base URL of the keyword service; override with the API_BASE_URL env var.
API_BASE_URL = os.getenv("API_BASE_URL", "http://127.0.0.1:9999")

# HMAC-SHA256 key used to sign request URLs. The JOB_SIGN_KEY environment
# variable overrides the built-in default so the key can be rotated without
# editing the source.
SIGN_KEY = os.getenv(
    "JOB_SIGN_KEY",
    "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b",
)
|
||||
|
||||
|
||||
def _now_ts() -> str:
    """Return the current Unix time in whole seconds, as a string."""
    seconds = int(time.time())
    return f"{seconds}"
|
||||
|
||||
|
||||
def _build_proxy() -> Dict[str, str]:
    """Build the proxy mapping applied to both HTTP and HTTPS traffic.

    The proxy URL is taken from the ``PROXY_URL`` environment variable and
    falls back to the built-in default when that variable is unset.

    Returns:
        Mapping with ``http`` and ``https`` keys pointing at the same proxy.
    """
    # NOTE(review): the fallback URL embeds proxy credentials; consider
    # moving them out of source control.
    fallback = "http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818"
    proxy_url = os.getenv("PROXY_URL", fallback)
    return dict(http=proxy_url, https=proxy_url)
|
||||
|
||||
|
||||
def _hmac_sign_url(url: str) -> str:
    """Return the hex HMAC-SHA256 digest of the full URL, keyed by SIGN_KEY."""
    key_bytes = SIGN_KEY.encode("utf-8")
    message = url.encode("utf-8")
    digest = hmac.new(key_bytes, message, hashlib.sha256)
    return digest.hexdigest()
|
||||
|
||||
|
||||
def _generate_acw_sc_v2(arg1: str) -> str:
    """Derive the ``acw_sc__v2`` value from an ``arg1`` token.

    The characters of ``arg1`` are reordered through a fixed position table,
    then the reordered string is XOR-ed with a fixed mask, two hex digits at
    a time.

    Args:
        arg1: Hexadecimal token; an empty value yields an empty string.

    Returns:
        Lower-case hex string of the XOR result.
    """
    if not arg1:
        return ""
    order = (
        15, 35, 29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13,
        6, 11, 39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34,
        37, 12, 36,
    )
    mask = "3000176000856006061501533003690027800375"
    # Output slot j takes the input character at 1-based position order[j];
    # positions that lie beyond the end of the input are simply left out.
    available = len(arg1)
    shuffled = "".join(arg1[pos - 1] for pos in order if pos <= available)
    limit = min(len(shuffled), len(mask))
    chunks = []
    for start in range(0, limit, 2):
        left = int(shuffled[start:start + 2], 16)
        right = int(mask[start:start + 2], 16)
        chunks.append(format(left ^ right, "02x"))
    return "".join(chunks)
|
||||
|
||||
|
||||
def _build_web_headers(keyword: str, did: str, sign: str) -> Dict[str, str]:
    """Assemble the request headers sent to the web search endpoint.

    Args:
        keyword: Search keyword; URL-encoded into the referer.
        did: Identifier sent as the ``uuid`` header and the ``guid`` cookie.
        sign: Request signature placed in the ``sign`` header.

    Returns:
        Header mapping for the search request.
    """
    referer = (
        "https://we.51job.com/pc/search?jobArea=020000&keyword="
        + quote(keyword, safe="")
        + "&searchType=2&keywordType="
    )
    # JSON payload carried, URL-encoded, in the ``property`` header.
    page_context = {
        "partner": "",
        "webId": 2,
        "fromdomain": "51job_web",
        "frompageUrl": "https://we.51job.com/",
        "pageUrl": referer,
        "identityType": "",
        "userType": "",
        "isLogin": "否",
        "accountid": "",
        "keywordType": "直接输入",
    }
    encoded_context = quote(json.dumps(page_context, ensure_ascii=False), safe="")
    user_agent = (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36"
    )
    return {
        "User-Agent": user_agent,
        "Accept": "application/json, text/plain, */*",
        "accept-language": "zh-CN,zh;q=0.9",
        "from-domain": "51job_web",
        "priority": "u=1, i",
        "property": encoded_context,
        "referer": referer,
        "sec-ch-ua": '"Not/A)Brand";v="8", "Chromium";v="136", "Google Chrome";v="136"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"macOS"',
        "sec-fetch-dest": "empty",
        "sec-fetch-mode": "cors",
        "sec-fetch-site": "same-origin",
        "sign": sign,
        "uuid": did,
        "Cookie": f"guid={did}",
    }
|
||||
|
||||
|
||||
def _build_query(city_code: str, page: int, keyword: str) -> str:
    """Build the URL-encoded query string for the search-pc endpoint.

    Args:
        city_code: Value sent as ``jobArea``.
        page: Page number, sent as ``pageNum``.
        keyword: Search keyword.

    Returns:
        Query string (without a leading ``?``), with fields in a fixed order.
    """
    # Pairs are listed in the exact order they must appear in the URL.
    fields = [
        ('api_key', '51job'),
        ('timestamp', _now_ts()),
        ('keyword', keyword),
        ('searchType', '2'),
        ('function', ''),
        ('industry', ''),
        ('jobArea', city_code),
        ('jobArea2', ''),
        ('landmark', ''),
        ('metro', ''),
        ('salary', ''),
        ('workYear', ''),
        ('degree', ''),
        ('companyType', ''),
        ('companySize', ''),
        ('jobType', ''),
        ('issueDate', ''),
        ('sortType', '0'),
        ('pageNum', str(page)),
        ('requestId', ''),
        ('pageSize', '20'),
        ('source', '1'),
        ('accountId', ''),
        ('pageCode', 'sou|sou|soulb'),
        ('scene', '7'),
    ]
    return urllib.parse.urlencode(dict(fields), quote_via=urllib.parse.quote)
|
||||
|
||||
|
||||
def _prefetch_acw(city_code: str, page: int, keyword: str, proxies: Dict[str, str]) -> Optional[str]:
    """Request the search URL once to read ``arg1`` and derive ``acw_sc__v2``.

    Args:
        city_code: Value sent as ``jobArea``.
        page: Page number.
        keyword: Search keyword.
        proxies: Proxy mapping passed to ``requests``.

    Returns:
        The derived cookie value, or None when ``arg1`` is absent from the
        response or any error occurs.
    """
    full_url = BASE_URL + "?" + _build_query(city_code, page, keyword)
    did = str(random.randint(10**15, 10**16-1))
    headers = _build_web_headers(keyword, did, _hmac_sign_url(full_url))
    try:
        resp = requests.get(full_url, headers=headers, timeout=20, proxies=proxies, verify=False)
        arg1 = re_search_arg1(resp.text or "")
        return _generate_acw_sc_v2(arg1) if arg1 else None
    except Exception:
        # Best-effort: any failure means no cookie is available.
        return None
|
||||
|
||||
|
||||
def re_search_arg1(text: str) -> Optional[str]:
    """Extract the quoted value assigned to ``var arg1`` in the given text.

    Returns:
        The captured value, or None when no such assignment is found.
    """
    import re

    pattern = r"var\s+arg1\s*=\s*['\"]([^'\"]+)['\"]"
    found = re.search(pattern, text)
    if found is None:
        return None
    return found.group(1)
|
||||
|
||||
|
||||
def fetch_page(city_code: str, keyword: str, page: int, proxies: Dict[str, str]) -> Dict[str, Any]:
    """Fetch one search result page.

    Args:
        city_code: Value sent as ``jobArea``.
        keyword: Search keyword.
        page: Page number.
        proxies: Proxy mapping passed to ``requests``.

    Returns:
        The parsed JSON body, or ``{"raw": <text>}`` when the body is not
        valid JSON.
    """
    full_url = BASE_URL + "?" + _build_query(city_code, page, keyword)
    did = str(random.randint(10**15, 10**16-1))
    headers = _build_web_headers(keyword, did, _hmac_sign_url(full_url))
    # Attach the anti-bot cookie only when the prefetch managed to derive one.
    acw_cookie = _prefetch_acw(city_code, page, keyword, proxies)
    if acw_cookie:
        current_cookie = headers.get("Cookie", "")
        headers["Cookie"] = current_cookie + f"; acw_sc__v2={acw_cookie}"
    resp = requests.get(full_url, headers=headers, timeout=30, proxies=proxies, verify=False)
    try:
        parsed = resp.json()
    except ValueError:
        return {"raw": resp.text}
    return parsed
|
||||
|
||||
|
||||
def load_company_keywords() -> List[str]:
    """Read ``company.txt`` next to this module as a list of keywords.

    Returns:
        Non-empty, whitespace-stripped lines; an empty list when the file
        cannot be read.
    """
    path = os.path.join(os.path.dirname(__file__), "company.txt")
    try:
        with open(path, "r", encoding="utf-8") as handle:
            stripped = [raw.strip() for raw in handle.readlines()]
    except Exception:
        return []
    return [entry for entry in stripped if entry]
|
||||
|
||||
|
||||
def fetch_service_params() -> Optional[Dict[str, Any]]:
    """Fetch one available search condition from the service and mark it used.

    Returns:
        ``{"city_code": str, "keyword": str}``, or None when the service
        returns a non-200 status, no items, or any error occurs.
    """
    try:
        listing = requests.get(
            f"{API_BASE_URL}/api/v1/keyword/available",
            params={"source": "qcwy", "limit": 1},
            timeout=10,
        )
        if listing.status_code != 200:
            return None
        data = listing.json().get("data") or {}
        items = data.get("items") or []
        if not items:
            return None
        first = items[0]
        record_id = first.get("id")
        if record_id:
            # Marking the record as used is best-effort; failures are ignored.
            try:
                requests.post(
                    f"{API_BASE_URL}/api/v1/keyword/mark-used",
                    json={"source": "qcwy", "ids": [record_id]},
                    timeout=10,
                )
            except Exception:
                pass
        return {
            "city_code": str(first.get("city", "")),
            "keyword": str(first.get("job", "")),
        }
    except Exception:
        return None
|
||||
|
||||
|
||||
def run(city_code: str = "020000", max_pages: int = 3) -> None:
    """Run the search and print each page's response as one JSON line.

    The city and keyword come from the keyword service when it returns a
    condition; otherwise the keywords are read from the local company.txt
    and the ``city_code`` argument is used.

    Args:
        city_code: Fallback area code.
        max_pages: Number of pages fetched per keyword.
    """
    proxies = _build_proxy()
    svc = fetch_service_params()
    if svc:
        keywords = [svc["keyword"]]
        city_code = svc["city_code"]
    else:
        keywords = load_company_keywords()
    for kw in keywords:
        for page_no in range(1, max_pages + 1):
            record = {
                "keyword": kw,
                "page": page_no,
                "data": fetch_page(city_code, kw, page_no, proxies),
            }
            print(json.dumps(record, ensure_ascii=False))
            # Random pause between page requests.
            time.sleep(random.uniform(0.8, 1.6))


if __name__ == "__main__":
    run()
|
||||
@ -1,974 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import uuid as _uuid
|
||||
from typing import Any, Dict, Optional, Callable
|
||||
import socket
|
||||
import random
|
||||
import sqlite3
|
||||
from urllib.parse import quote
|
||||
from urllib.request import Request, urlopen, build_opener, ProxyHandler, HTTPSHandler
|
||||
from urllib.error import HTTPError, URLError
|
||||
import ssl
|
||||
|
||||
|
||||
# Host that every signed API path is appended to.
BASE_URL = "https://cupid.51job.com"
# HMAC-SHA256 signing key; the JOB_SIGN_KEY env var overrides the default.
SIGN_KEY = os.getenv("JOB_SIGN_KEY", "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b")
# Value sent in the From-Domain request header.
FROM_DOMAIN = "51job_weixin_wxapp"

# Base URL of the local service; the API_BASE_URL env var overrides it.
API_BASE_URL = os.getenv("API_BASE_URL", "http://127.0.0.1:9999")
# NOTE(review): presumably caches company info by id; confirm against callers.
COMPANY_INFO_CACHE: Dict[str, Any] = dict()
# File (next to this module) that successful company names are appended to.
SUCCESS_LOG_PATH = os.path.join(os.path.dirname(__file__), "success.txt")
# Company names already written to the success log by this process.
_SUCCESS_WRITTEN: set = set()
|
||||
|
||||
|
||||
def _timestamp() -> int:
    """Return the current Unix time truncated to whole seconds."""
    now = time.time()
    return int(now)
|
||||
|
||||
|
||||
def _encode_query(params: Optional[Dict[str, object]]) -> str:
    """Percent-encode query parameters, keeping their insertion order.

    List and tuple values expand into one ``key=value`` pair per element and
    ``None`` becomes an empty value. Every character outside the unreserved
    set is escaped (``safe=""``).

    Args:
        params: Parameters to encode; may be None or empty.

    Returns:
        A string starting with ``&`` when params are given, else ``""``.
    """
    if not params:
        return ""

    def _escape(raw: object) -> str:
        return quote(str(raw), safe="")

    encoded = []
    for name, value in params.items():
        key = _escape(name)
        if value is None:
            encoded.append(f"{key}=")
        elif isinstance(value, (list, tuple)):
            encoded.extend(f"{key}={_escape(element)}" for element in value)
        else:
            encoded.append(f"{key}={_escape(value)}")
    return "&" + "&".join(encoded)
|
||||
|
||||
|
||||
def build_signature(
    method: str,
    path: str,
    query_params: Optional[Dict[str, object]] = None,
    body_json: Optional[str] = None,
    timestamp: Optional[int] = None,
) -> Dict[str, Any]:
    """Build the string to sign and compute its HMAC-SHA256 signature.

    The signed string always starts with
    ``/<path>?api_key=51job&timestamp=<ts>``. For GET the encoded query
    parameters are appended to both the signed string and the returned path;
    for any other method the JSON body (when given) is appended to the signed
    string only.

    Args:
        method: HTTP method (GET or POST); compared case-insensitively.
        path: API path without the leading slash.
        query_params: Query parameters, used for GET requests.
        body_json: Serialized JSON body, used for non-GET requests.
        timestamp: Timestamp to sign with; the current time when falsy.

    Returns:
        Dict with ``sig`` (hex digest), ``signed_path`` (path plus query to
        request) and ``ts`` (the timestamp used).
    """
    import hmac
    import hashlib

    ts = timestamp or _timestamp()
    # The separator must be a literal "&timestamp="; the earlier "×tamp" text
    # was an HTML-entity corruption that produced a malformed query string.
    base = f"/{path}?api_key=51job&timestamp={ts}"
    sign_str = base
    if method.upper() == "GET":
        q = _encode_query(query_params)
        sign_str += q
        signed_path = base + q
    else:
        if body_json:
            sign_str += body_json
        signed_path = base
    key_bytes = SIGN_KEY.encode("utf-8")
    sig = hmac.new(key_bytes, sign_str.encode("utf-8"), hashlib.sha256).hexdigest()
    return {"sig": sig, "signed_path": signed_path, "ts": ts}
|
||||
|
||||
|
||||
def _build_headers(
    sign: str,
    content_type: str,
    uuid: Optional[str] = None,
    account_id: Optional[str] = None,
    user_token: Optional[str] = None,
    partner: Optional[str] = None,
    property_obj: Optional[Dict[str, Any]] = None,
    headers_ext: Optional[Dict[str, str]] = None,
) -> Dict[str, str]:
    """Build the request headers, including the signature and client context.

    Args:
        sign: Hex-encoded request signature.
        content_type: Value for the Content-Type header.
        uuid: Device identifier; a random UUID4 is generated when falsy.
        account_id: Sent as ``account-id`` when truthy.
        user_token: Sent as ``user-token`` when truthy.
        partner: Sent as ``partner`` when truthy.
        property_obj: Payload for the ``property`` header; a default payload
            is built when falsy.
        headers_ext: Extra headers merged last, overriding earlier values.

    Returns:
        The complete header mapping.
    """
    device_id = uuid or str(_uuid.uuid4())
    headers = {
        "sign": sign,
        "From-Domain": FROM_DOMAIN,
        "Content-Type": content_type,
        "Accept": "application/json",
        "uuid": device_id,
    }
    optional_headers = (
        ("account-id", account_id),
        ("user-token", user_token),
        ("partner", partner),
    )
    for header_name, header_value in optional_headers:
        if header_value:
            headers[header_name] = header_value

    if property_obj:
        prop = property_obj
    else:
        prop = {
            "frompageUrl": "",
            "pageUrl": "",
            "isLogin": "是" if bool(account_id) else "否",
            "accountid": account_id or "",
            "resumeId": "",
            "firstFrompageUrl": "",
            "distinct_id": device_id,
        }
    # The property payload travels as URL-encoded JSON.
    headers["property"] = quote(json.dumps(prop, ensure_ascii=False), safe="")
    if headers_ext:
        headers.update(headers_ext)
    return headers
|
||||
|
||||
|
||||
def _request(
    method: str,
    path: str,
    params: Optional[Dict[str, Any]] = None,
    body: Optional[Dict[str, Any]] = None,
    uuid: Optional[str] = None,
    account_id: Optional[str] = None,
    user_token: Optional[str] = None,
    partner: Optional[str] = None,
    property_obj: Optional[Dict[str, Any]] = None,
    headers_ext: Optional[Dict[str, str]] = None,
    proxies: Optional[list] = None,
    timeout: int = 10,
    retries: int = 2,
    raw_sink: Optional[Callable[[str], None]] = None,
) -> Dict[str, Any]:
    """Send a signed HTTP request, retrying on HTTP/URL errors.

    Args:
        method: HTTP method.
        path: API path without the leading slash.
        params: Query parameters, used for GET.
        body: JSON body, sent for POST.
        uuid: Device identifier for the headers.
        account_id: Account id for the headers.
        user_token: User token for the headers.
        partner: Partner identifier for the headers.
        property_obj: Payload for the ``property`` header.
        headers_ext: Extra headers to merge.
        proxies: Proxy entries (URL strings or handler dicts); one is picked
            per attempt in round-robin order.
        timeout: Per-request timeout in seconds.
        retries: Number of retries after the first attempt.
        raw_sink: Optional callback given the raw response text; errors it
            raises are ignored.

    Returns:
        The parsed JSON response, or an empty dict when the body is empty or
        every attempt fails with an HTTP/URL error.
    """
    verb = method.upper()
    body_json = None if body is None else json.dumps(body, ensure_ascii=False)
    signature = build_signature(method, path, params, body_json)
    if verb == "GET":
        content_type = "application/x-www-form-urlencoded"
    else:
        content_type = "application/json"
    headers = _build_headers(
        sign=signature["sig"],
        content_type=content_type,
        uuid=uuid,
        account_id=account_id,
        user_token=user_token,
        partner=partner,
        property_obj=property_obj,
        headers_ext=headers_ext,
    )
    url = BASE_URL + signature["signed_path"]
    data_bytes = body_json.encode("utf-8") if (verb == "POST" and body_json is not None) else None

    backoff = 0.5
    use_insecure_ssl = os.getenv("JOB_INSECURE_SSL") in ("1", "true", "TRUE")
    for attempt in range(retries + 1):
        _sleep_between_requests(0.2, 0.7)
        req = Request(url=url, data=data_bytes, headers=headers, method=verb)
        try:
            ctx = _get_ssl_context(use_insecure_ssl)
            proxy_handler = None
            if proxies:
                # Rotate through the proxy list, one entry per attempt.
                entry = proxies[attempt % len(proxies)]
                if isinstance(entry, str):
                    proxy_handler = ProxyHandler({"http": entry, "https": entry})
                elif isinstance(entry, dict):
                    proxy_handler = ProxyHandler(entry)
            if proxy_handler:
                opener = build_opener(proxy_handler, HTTPSHandler(context=ctx))
                response = opener.open(req, timeout=timeout)
            else:
                response = urlopen(req, timeout=timeout, context=ctx)
            with response as resp:
                payload = resp.read().decode("utf-8")
            if raw_sink and isinstance(payload, str):
                try:
                    raw_sink(payload)
                except Exception:
                    pass
            if not payload:
                return {}
            return json.loads(payload)
        except (HTTPError, URLError) as err:
            print(err)
            cert_failure = "CERTIFICATE_VERIFY_FAILED" in str(err)
            if cert_failure and not use_insecure_ssl:
                # Retry straight away with certificate verification disabled.
                use_insecure_ssl = True
                continue
            if attempt == retries:
                return {}
            time.sleep(backoff)
            backoff *= 2

    return {}
|
||||
|
||||
|
||||
def search_company_keyword(
    keyword: str,
    page: int = 1,
    size: int = 20,
    job_area: str = "020000",
    sort_type: str = "0",
    search_type: str = "2",
    scene: str = "12",
    uuid: Optional[str] = None,
    account_id: Optional[str] = None,
    user_token: Optional[str] = None,
    partner: Optional[str] = None,
    property_obj: Optional[Dict[str, Any]] = None,
    raw_sink: Optional[Callable[[str], None]] = None,
) -> Dict[str, Any]:
    """Call ``open/noauth/search`` (GET) with a keyword.

    Args:
        keyword: Keyword to search.
        page: Page number.
        size: Page size.
        job_area: Area code.
        sort_type: Sort type.
        search_type: Search type.
        scene: Scene id.
        uuid: Device identifier for the headers.
        account_id: Account id for the headers.
        user_token: User token for the headers.
        partner: Partner identifier for the headers.
        property_obj: Payload for the ``property`` header.
        raw_sink: Optional callback given the raw response text.

    Returns:
        The API response JSON as returned by ``_request``.
    """
    # The pair order below is the order the parameters are encoded and signed in.
    ordered_fields = [
        ("userLonLat", ""),
        ("sortType", sort_type),
        ("keyword", keyword),
        ("pageSize", str(size)),
        ("pageNum", str(page)),
        ("jobArea", job_area),
        ("landmark", ""),
        ("radius", ""),
        ("workYear", ""),
        ("degree", ""),
        ("companyType", ""),
        ("companySize", ""),
        ("salary", "NaN-NaN"),
        ("jobType", ""),
        ("metro", ""),
        ("function", ""),
        ("industry", ""),
        ("issueDate", ""),
        ("searchType", search_type),
        ("scene", scene),
    ]
    query: Dict[str, Any] = dict(ordered_fields)
    header_context = {
        "uuid": uuid,
        "account_id": account_id,
        "user_token": user_token,
        "partner": partner,
        "property_obj": property_obj,
    }
    return _request(
        method="GET",
        path="open/noauth/search",
        params=query,
        raw_sink=raw_sink,
        **header_context,
    )
|
||||
|
||||
|
||||
def company_jobs_by_id(
    co_id: str,
    page: int = 1,
    size: int = 20,
    uuid: Optional[str] = None,
    account_id: Optional[str] = None,
    user_token: Optional[str] = None,
    partner: Optional[str] = None,
    property_obj: Optional[Dict[str, Any]] = None,
    raw_sink: Optional[Callable[[str], None]] = None,
) -> Dict[str, Any]:
    """Fetch a company's jobs by company id via POST ``open/noauth/jobs/company``.

    Args:
        co_id: Company id.
        page: Page number.
        size: Page size.
        uuid: Device identifier for the headers.
        account_id: Account id for the headers.
        user_token: User token for the headers.
        partner: Partner identifier for the headers.
        property_obj: Payload for the ``property`` header.
        raw_sink: Optional callback given the raw response text.

    Returns:
        The API response JSON as returned by ``_request``.
    """
    payload: Dict[str, Any] = {}
    payload["pageNum"] = page
    payload["pageSize"] = size
    payload["coId"] = co_id
    payload["scene"] = 14
    payload["requestId"] = ""
    header_context = {
        "uuid": uuid,
        "account_id": account_id,
        "user_token": user_token,
        "partner": partner,
        "property_obj": property_obj,
    }
    return _request(
        method="POST",
        path="open/noauth/jobs/company",
        body=payload,
        raw_sink=raw_sink,
        **header_context,
    )
|
||||
|
||||
|
||||
def _extract_items(resp: Dict[str, Any]) -> list:
    """Pull the first list of items out of a nested response.

    Lookup order: ``resultbody``/``resultBody`` -> ``job`` -> ``items``; then
    a set of well-known list keys at the top level; finally a depth-first
    walk that returns the first list it meets.

    Args:
        resp: Parsed JSON response.

    Returns:
        The list found, or an empty list when none exists (or when ``resp``
        is not a dict).
    """
    if not isinstance(resp, dict):
        return []

    body = resp.get("resultbody") or resp.get("resultBody")
    if isinstance(body, dict):
        job = body.get("job")
        if isinstance(job, dict) and isinstance(job.get("items"), list):
            return job.get("items", [])

    known_keys = (
        "items",
        "list",
        "jobs",
        "jobList",
        "companies",
        "companyList",
        "resultList",
        "dataList",
    )

    def _first_known_list(mapping: dict) -> Optional[list]:
        # First value stored under a well-known key that is a list.
        for name in known_keys:
            candidate = mapping.get(name)
            if isinstance(candidate, list):
                return candidate
        return None

    direct = _first_known_list(resp)
    if direct is not None:
        return direct

    def _search(node: Any) -> Optional[list]:
        if isinstance(node, list):
            return node
        if not isinstance(node, dict):
            return None
        hit = _first_known_list(node)
        if hit is not None:
            return hit
        for child in node.values():
            nested = _search(child)
            if isinstance(nested, list):
                return nested
        return None

    return _search(resp) or []
|
||||
|
||||
|
||||
def _get_local_ip() -> str:
    """Return the local IP address used for outbound traffic.

    A UDP socket is connected to 8.8.8.8:80 and the address it is bound to
    is read back.

    Returns:
        The local IP string, or ``"127.0.0.1"`` when it cannot be determined.
    """
    try:
        # The context manager closes the socket even when connect() or
        # getsockname() raises; previously the socket leaked on that path.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as probe:
            probe.connect(("8.8.8.8", 80))
            return probe.getsockname()[0]
    except OSError:
        return "127.0.0.1"
|
||||
|
||||
|
||||
def _get_ssl_context(insecure: bool = False) -> ssl.SSLContext:
    """Return an SSL context, optionally without certificate verification.

    Args:
        insecure: When true, return a context that skips verification.

    Returns:
        The configured SSL context.
    """
    if not insecure:
        try:
            return ssl.create_default_context()
        except Exception:
            # NOTE(review): this silently downgrades to an unverified context
            # when the default one cannot be created.
            return ssl._create_unverified_context()

    try:
        return ssl._create_unverified_context()
    except Exception:
        # Build the unverified context by hand when the helper fails.
        manual = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
        manual.check_hostname = False
        manual.verify_mode = ssl.CERT_NONE
        return manual
|
||||
|
||||
def _get_db_path() -> str:
    """Return the default SQLite path: ``qcwy_raw.sqlite3`` next to this module."""
    module_dir = os.path.dirname(__file__)
    db_file = "qcwy_raw.sqlite3"
    return os.path.join(module_dir, db_file)
|
||||
|
||||
|
||||
def _init_db(db_path: str) -> None:
    """Create the ``responses`` table and its unique index if they are absent.

    Initialization is best-effort: any error is swallowed, so the caller
    never sees an exception from this function.

    Args:
        db_path: Path to the SQLite database file.
    """
    try:
        con = sqlite3.connect(db_path)
        try:
            cur = con.cursor()
            cur.execute(
                """
                CREATE TABLE IF NOT EXISTS responses (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    keyword TEXT NOT NULL,
                    page INTEGER NOT NULL,
                    created_at INTEGER NOT NULL,
                    payload TEXT NOT NULL
                )
                """
            )
            # One stored response per (keyword, page) pair.
            cur.execute(
                """
                CREATE UNIQUE INDEX IF NOT EXISTS idx_responses_keyword_page
                ON responses(keyword, page)
                """
            )
            con.commit()
        finally:
            # Close even when a statement fails; previously the connection
            # leaked on that path.
            con.close()
    except Exception:
        # Deliberately best-effort: later reads and writes are guarded too.
        pass
|
||||
|
||||
|
||||
def _save_raw_response(db_path: str, keyword: str, page: int, raw_payload: str) -> None:
    """Store a raw response payload in SQLite, unmodified.

    The insert uses ``INSERT OR IGNORE``. Saving is best-effort: any error is
    swallowed.

    Args:
        db_path: Path to the SQLite database file.
        keyword: Search keyword the response belongs to.
        page: Page number of the response.
        raw_payload: Raw response text as received.
    """
    try:
        con = sqlite3.connect(db_path)
        try:
            con.execute(
                "INSERT OR IGNORE INTO responses(keyword, page, created_at, payload) VALUES(?, ?, ?, ?)",
                (keyword, int(page), int(time.time()), raw_payload),
            )
            con.commit()
        finally:
            # Close even when the insert fails; previously the connection
            # leaked on that path.
            con.close()
    except Exception:
        # Deliberately best-effort: a failed save must not stop the crawl.
        pass
|
||||
def _has_page_record(db_path: str, keyword: str, page: int) -> bool:
    """Report whether a response for this keyword and page is already stored.

    Args:
        db_path: Path to the SQLite database file.
        keyword: Search keyword.
        page: Page number.

    Returns:
        True when a matching row exists; False otherwise, including when the
        lookup fails for any reason.
    """
    try:
        con = sqlite3.connect(db_path)
        try:
            row = con.execute(
                "SELECT 1 FROM responses WHERE keyword=? AND page=? LIMIT 1",
                (keyword, int(page)),
            ).fetchone()
        finally:
            # Close even when the query fails; previously the connection
            # leaked on that path.
            con.close()
        return row is not None
    except Exception:
        return False
|
||||
|
||||
def _make_item_key(it: Any) -> str:
    """Return a deduplication key for an item.

    Dicts use the first non-None value among ``jobId``, ``id``, ``job_id``
    and ``positionId`` (prefixed ``id:``); otherwise a key-sorted JSON dump
    (prefixed ``hash:``). Anything else, or any failure, yields ``val:``
    followed by ``str(it)``.
    """
    id_fields = ("jobId", "id", "job_id", "positionId")
    try:
        if not isinstance(it, dict):
            return "val:" + str(it)
        for field_name in id_fields:
            identifier = it.get(field_name)
            if identifier is not None:
                return f"id:{identifier}"
        serialized = json.dumps(it, ensure_ascii=False, sort_keys=True)
        return "hash:" + serialized
    except Exception:
        return "val:" + str(it)
|
||||
def _sleep_between_requests(min_seconds: float = 1.0, max_seconds: float = 3.0) -> None:
    """Pause for a random duration drawn from [min_seconds, max_seconds].

    When drawing or sleeping for the random duration fails, the function
    sleeps for ``min_seconds`` instead.

    Args:
        min_seconds: Lower bound of the pause, in seconds.
        max_seconds: Upper bound of the pause, in seconds.
    """
    try:
        pause = random.uniform(min_seconds, max_seconds)
        time.sleep(pause)
    except Exception:
        time.sleep(min_seconds)
|
||||
|
||||
|
||||
def _record_company_success(company_name: Optional[str]) -> None:
    """Append a company name to the success log, at most once per process.

    Blank names and names already recorded in this process are skipped.
    Any error while writing is ignored.

    Args:
        company_name: Company name; may be None.
    """
    try:
        cleaned = (company_name or "").strip()
        if cleaned and cleaned not in _SUCCESS_WRITTEN:
            with open(SUCCESS_LOG_PATH, "a", encoding="utf-8") as log_file:
                log_file.write(f"{cleaned}\n")
            # Remember the name only after it has been written out.
            _SUCCESS_WRITTEN.add(cleaned)
    except Exception:
        pass
|
||||
|
||||
|
||||
def _extract_company_name(info: Dict[str, Any]) -> Optional[str]:
    """Find the company name in a company-info payload.

    Known name keys are tried at the top level first, then inside the
    ``basicinfo`` and ``basicInfo`` sub-dicts. Only non-blank strings count.

    Args:
        info: Company info dict.

    Returns:
        The stripped company name, or None when no key holds one.
    """
    name_keys = (
        "coname",
        "coName",
        "fullCompanyName",
        "companyName",
        "fullname",
        "name",
    )

    def _first_name(mapping: Dict[str, Any]) -> Optional[str]:
        # First non-blank string stored under one of the known name keys.
        for key in name_keys:
            candidate = mapping.get(key)
            if isinstance(candidate, str) and candidate.strip():
                return candidate.strip()
        return None

    top_level = _first_name(info)
    if top_level is not None:
        return top_level
    for container_key in ("basicinfo", "basicInfo"):
        container = info.get(container_key)
        if not isinstance(container, dict):
            continue
        nested = _first_name(container)
        if nested is not None:
            return nested
    return None
|
||||
|
||||
def _report_universal(items: list, data_type: str = "job") -> bool:
    """Placeholder for reporting items to the batch-store endpoint.

    This is currently a stub: no request is made, both arguments are
    ignored, and the result is always False.

    Args:
        items: Data list that would be sent.
        data_type: Logical data type label.

    Returns:
        Always False.
    """
    accepted = False
    return accepted
|
||||
|
||||
|
||||
def _extract_total_count(resp: Dict[str, Any]) -> Optional[int]:
    """Read the ``totalCount`` field of the job node in a response.

    Several candidate locations of the job node are tried in a fixed order;
    the first one holding an int (or a digit-only string) wins.

    Args:
        resp: Parsed response JSON.

    Returns:
        The total count, or None when it is not present.
    """
    if not isinstance(resp, dict):
        return None

    candidate_paths = (
        ("resultbody", "job"),
        ("resultBody", "job"),
        ("job",),
        ("jobs",),
        ("result", "job"),
        ("data", "job"),
        ("payload", "job"),
    )
    for path in candidate_paths:
        node: Any = resp
        for step in path:
            if not isinstance(node, dict):
                node = None
                break
            node = node.get(step)
        if not isinstance(node, dict):
            continue
        total = node.get("totalCount")
        if isinstance(total, int):
            return total
        if isinstance(total, str):
            digits = total.strip()
            if digits.isdigit():
                return int(digits)
    return None
|
||||
|
||||
|
||||
def paginate_search_company_keyword(
    keyword: str,
    size: int = 20,
    job_area: str = "020000",
    sort_type: str = "0",
    search_type: str = "2",
    scene: str = "12",
    start_page: int = 1,
    max_pages: Optional[int] = None,
    delay: float = 0.2,
    verbose: bool = False,
    db_path: Optional[str] = None,
) -> list:
    """Walk the company keyword search page by page until a page adds nothing new.

    Pages already recorded for ``keyword`` (per ``_has_page_record``) are
    skipped without a request but still count towards ``max_pages``.
    Iteration also stops once the number of collected items reaches the
    ``totalCount`` reported by the first fetched response.

    Args:
        keyword (str): Keyword to search.
        size (int): Page size per request.
        job_area (str): Area code.
        sort_type (str): Sort type.
        search_type (str): Search type.
        scene (str): Scene id.
        start_page (int): Starting page number.
        max_pages (Optional[int]): Maximum pages to visit; None for unlimited.
        delay (float): Lower bound of the sleep between pages (upper bound is
            twice that); non-positive values fall back to 0.2.
        verbose (bool): Whether to print per-page stats as JSON lines.
        db_path (Optional[str]): Storage path handed to ``_init_db``,
            ``_has_page_record`` and ``_save_raw_response``; defaults to
            ``_get_db_path()``.

    Returns:
        list: De-duplicated items aggregated across the fetched pages.
    """

    def _emit(payload: dict) -> None:
        print(json.dumps(payload, ensure_ascii=False))

    def _pause() -> None:
        low = delay if delay > 0 else 0.2
        _sleep_between_requests(low, low * 2)

    collected: list = []
    known_keys: set = set()
    expected_total: Optional[int] = None
    page = start_page
    pages_done = 0

    db_path = db_path or _get_db_path()
    _init_db(db_path)

    while max_pages is None or pages_done < max_pages:
        if verbose:
            _emit({"fetching_page": page})

        if _has_page_record(db_path, keyword, page):
            # This page was stored by an earlier run: move on without a request.
            if verbose:
                _emit({"page": page, "skipped": True})
        else:
            def _sink(raw: str) -> None:
                _save_raw_response(db_path, keyword, page, raw)

            resp = search_company_keyword(
                keyword=keyword,
                page=page,
                size=size,
                job_area=job_area,
                sort_type=sort_type,
                search_type=search_type,
                scene=scene,
                raw_sink=_sink,
            )
            if verbose:
                _emit({"page": page, "saved": True})

            if expected_total is None:
                expected_total = _extract_total_count(resp)
                if verbose and expected_total is not None:
                    _emit({"totalCount": expected_total})

            items = _extract_items(resp)
            fresh: list = []
            for entry in items:
                entry_key = _make_item_key(entry)
                if entry_key not in known_keys:
                    known_keys.add(entry_key)
                    fresh.append(entry)
            if verbose:
                _emit({"page": page, "items_on_page": len(items), "unique_added": len(fresh)})

            if not fresh:
                break
            collected.extend(fresh)

            if expected_total is not None and len(collected) >= expected_total:
                break

        page += 1
        pages_done += 1
        _pause()

    return collected
|
||||
|
||||
|
||||
# Default parameters for the pagination demo; main() forwards every entry
# except "keyword" to paginate_search_company_keyword().
CONFIG: Dict[str, Any] = dict(
    keyword="字节跳动",
    size=20,
    job_area="020000",
    sort_type="0",
    search_type="2",
    scene="12",
    start_page=1,
    max_pages=None,
    delay=0.2,
    verbose=False,
    db_path=None,
)
|
||||
|
||||
|
||||
def main(keyword: str) -> None:
    """Paginate the search for one keyword using the static CONFIG and print the item count.

    Args:
        keyword (str): Keyword to search; the "keyword" entry of CONFIG is not used here.

    Returns:
        None
    """
    forwarded = (
        "size",
        "job_area",
        "sort_type",
        "search_type",
        "scene",
        "start_page",
        "max_pages",
        "delay",
        "verbose",
        "db_path",
    )
    options = {name: CONFIG[name] for name in forwarded}

    results = paginate_search_company_keyword(keyword=keyword, **options)
    print(json.dumps({"total_items": len(results)}, ensure_ascii=False))
|
||||
|
||||
|
||||
def get_company_info(company_id: str) -> Dict[str, Any]:
    """Fetch company details, serving repeated lookups from COMPANY_INFO_CACHE.

    Args:
        company_id (str): Company identifier string.

    Returns:
        Dict[str, Any]: The ``resultbody`` dict of a successful response;
        an empty dict when the id is empty, the request fails, the response
        is not a dict with status 1/"1", or ``resultbody`` is empty.
    """
    if not company_id:
        return {}
    cached = COMPANY_INFO_CACHE.get(company_id)
    if isinstance(cached, dict) and cached:
        return cached

    params = {
        "companyId": company_id,
        "colorOne": "#ffffff",
        "colorTwo": "#ffffffcc",
    }
    property_obj = {
        "frompageUrl": "",
        "pageUrl": "",
        "isLogin": "否",
        "accountid": "",
        "resumeId": "",
        "firstFrompageUrl": "",
        "distinct_id": str(_uuid.uuid4()),
        "pageCode": "companyDetail|company|companyinfo",
        "shortPageCode": "companyDetail|company|companyinfo",
    }

    try:
        resp = _request(
            method="GET",
            path="open/noauth/company-info/info-data",
            params=params,
            property_obj=property_obj,
        )
    except Exception:
        # Best effort: any request failure is treated as "not found".
        resp = {}

    # Guard against a non-dict payload, which would otherwise raise
    # AttributeError on .get().
    if not isinstance(resp, dict) or resp.get("status") not in (1, "1"):
        return {}

    info = resp.get("resultbody", {})
    if not (isinstance(info, dict) and info):
        return {}

    # Only non-empty results are cached, so failed lookups are retried later.
    COMPANY_INFO_CACHE[company_id] = info
    _record_company_success(_extract_company_name(info))
    return info
|
||||
|
||||
|
||||
def _enrich_items_with_company_info(resp: Dict[str, Any]) -> list:
|
||||
"""Attach company info to items using cache.
|
||||
|
||||
Args:
|
||||
resp (Dict[str, Any]): Parsed JSON response.
|
||||
|
||||
Returns:
|
||||
list: Items with company_info fields when available.
|
||||
"""
|
||||
items = resp.get("resultbody", {}).get("job", {}).get("items", [])
|
||||
enriched = []
|
||||
for it in items:
|
||||
target = dict(it) if isinstance(it, dict) else {"_value": it}
|
||||
co_id = target.get("coId") or target.get("companyId")
|
||||
job_id = target.get("jobId")
|
||||
city_pinyin = target.get("hrefAreaPinYin")
|
||||
if co_id:
|
||||
info = get_company_info(str(co_id))
|
||||
if info:
|
||||
target["company_info"] = info
|
||||
target["company_desc"] = (info.get("coinfo", {}) or {}).get("coinfo")
|
||||
target["companyHref"] = (info.get("share", {}) or {}).get("weixinshareurl")
|
||||
target["jobHref"] = f"https://jobs.51job.com/{city_pinyin}/{job_id}.html"
|
||||
nm = _extract_company_name(info) or target.get("fullCompanyName") or target.get("companyName")
|
||||
_record_company_success(nm)
|
||||
_sleep_between_requests()
|
||||
enriched.append(target)
|
||||
return enriched
|
||||
|
||||
|
||||
def _load_keywords(path: str) -> list:
|
||||
"""Load keywords from a UTF-8 text file, one per line.
|
||||
|
||||
Args:
|
||||
path (str): File path.
|
||||
|
||||
Returns:
|
||||
list: Non-empty trimmed lines.
|
||||
"""
|
||||
|
||||
try:
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
lines = [ln.strip() for ln in f.readlines()]
|
||||
return [ln for ln in lines if ln]
|
||||
except Exception:
|
||||
return []
|
||||
|
||||
|
||||
def _progress_iter(seq: list, desc: str = "", total: Optional[int] = None):
|
||||
"""Iterate with a simple console progress bar.
|
||||
|
||||
Args:
|
||||
seq (list): Items to iterate.
|
||||
desc (str): Progress description.
|
||||
total (Optional[int]): Total count for percentage.
|
||||
|
||||
Yields:
|
||||
Any: Items from seq.
|
||||
"""
|
||||
|
||||
n = 0
|
||||
m = total if total is not None else len(seq)
|
||||
bar_len = 24
|
||||
for item in seq:
|
||||
n += 1
|
||||
filled = int(bar_len * n / m) if m else 0
|
||||
bar = "#" * filled + "-" * (bar_len - filled)
|
||||
pct = int(100 * n / m) if m else 100
|
||||
print(f"\r{desc} [{bar}] {n}/{m} {pct}%", end="", flush=True)
|
||||
yield item
|
||||
print("", flush=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Keywords come from company.txt next to this file; when that yields
    # nothing, fall back to the single keyword configured in CONFIG.
    base_dir = os.path.dirname(__file__)
    fp = os.path.join(base_dir, "company.txt")
    td = _load_keywords(fp) or [CONFIG.get("keyword")]
    for keyword in _progress_iter(td, desc="Keywords", total=len(td)):
        print(keyword)
        main(keyword)
        # Pause between keywords.
        _sleep_between_requests()
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,54 +0,0 @@
|
||||
#!/usr/bin/env bash
# Installs the crawler dependencies and prepares a detached tmux session
# in which the individual spiders are started.
set -euo pipefail
# Work from the directory that contains this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
# On apt-based systems install the OS packages, through sudo when available.
if command -v apt-get >/dev/null; then
  if command -v sudo >/dev/null; then
    sudo apt-get update -y
    sudo apt-get install -y unzip python3 python3-pip tmux
  else
    apt-get update -y
    apt-get install -y unzip python3 python3-pip tmux
  fi
fi
# Python dependencies of the spiders, installed into the system interpreter.
python3 -m pip install -U requests loguru httpx fake-useragent PySocks --break-system-packages
# Use tmux when it is installed; start_one falls back to nohup otherwise.
use_tmux=0
if command -v tmux >/dev/null; then
  use_tmux=1
fi
tmux_session="jobs_spider"
# Create the shared detached session once; later runs reuse it.
if [ "$use_tmux" -eq 1 ]; then
  if ! tmux has-session -t "$tmux_session" 2>/dev/null; then
    tmux new-session -d -s "$tmux_session" -c "$SCRIPT_DIR"
  fi
fi
|
||||
# start_one NAME DIR ENTRY
# Start "python3 ENTRY" from DIR unless a process matching ENTRY is already
# running. With tmux the spider runs in window NAME of $tmux_session
# (an existing window is respawned, or recreated when respawning fails);
# without tmux it is started in the background with nohup. Output is
# appended to DIR/logs/runner.log.
start_one() {
  # Keep the arguments function-local so they do not leak into the script.
  local name="$1"
  local dir="$2"
  local entry="$3"
  cd "$dir"
  mkdir -p logs
  if pgrep -f "$entry" >/dev/null; then
    echo "$name 已在运行"
    return 0
  fi
  if [ "$use_tmux" -eq 1 ]; then
    # Look for an existing window with exactly this name in the session.
    if tmux list-windows -t "$tmux_session" 2>/dev/null | awk -F: '{print $2}' | awk '{print $1}' | grep -qx "$name"; then
      if tmux respawn-window -t "$tmux_session:$name" -k -c "$dir" "python3 $entry >> logs/runner.log 2>&1"; then
        :
      else
        # Respawn failed: drop the stale window and create a fresh one.
        tmux kill-window -t "$tmux_session:$name" 2>/dev/null || true
        tmux new-window -t "$tmux_session" -n "$name" -c "$dir" "python3 $entry >> logs/runner.log 2>&1"
      fi
    else
      tmux new-window -t "$tmux_session" -n "$name" -c "$dir" "python3 $entry >> logs/runner.log 2>&1"
    fi
    echo "$name tmux: $tmux_session:$name"
  else
    nohup python3 "$entry" >> logs/runner.log 2>&1 &
    echo "$name PID: $!"
  fi
}
|
||||
# Launch one spider per platform: window/process name, working directory, entry script.
start_one "boss" "$SCRIPT_DIR/boss" "boos_api.py"
start_one "qcwy" "$SCRIPT_DIR/qcwy" "qcwy.py"
start_one "zhilian" "$SCRIPT_DIR/zhilian" "zhilian_single.py"
|
||||
@ -1,38 +0,0 @@
|
||||
# Use Python 3.9 as the base image
FROM python:3.9-slim

# Set the working directory
WORKDIR /app

# Python / pip runtime settings
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1

# Crawler configuration defaults (can be overridden with `docker run -e`)
ENV API_BASE_URL=http://124.222.245.240:9999 \
    MONGODB_URI=mongodb://localhost:27017 \
    MONGODB_DB=job_data \
    MAX_PAGES=3 \
    PAGE_SIZE=15 \
    MIN_WAIT_TIME=10 \
    MAX_WAIT_TIME=30 \
    ERROR_WAIT_MIN=30 \
    ERROR_WAIT_MAX=60

# Copy the requirements file and install the Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the project files
COPY boos_api.py ./
COPY city.json ./
COPY work.json ./

# Create a non-root user and run as that user
RUN useradd -m -u 1000 crawler && chown -R crawler:crawler /app
USER crawler

# Start command
CMD ["python", "boos_api.py"]
|
||||
@ -1,855 +0,0 @@
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import pprint
|
||||
import random
|
||||
import uuid
|
||||
import hashlib
|
||||
from typing import Any, Dict, Optional, Callable
|
||||
|
||||
try:
|
||||
import requests
|
||||
except Exception:
|
||||
requests = None
|
||||
import ssl
|
||||
from urllib.request import Request, urlopen, ProxyHandler, build_opener, HTTPSHandler
|
||||
from urllib.parse import urlencode
|
||||
import sqlite3
|
||||
SUCCESS_LOG_PATH = os.path.join(os.path.dirname(__file__), "success.txt")
|
||||
_SUCCESS_WRITTEN: set = set()
|
||||
try:
|
||||
import httpx
|
||||
except Exception:
|
||||
httpx = None
|
||||
API_BASE_URL = os.getenv("API_BASE_URL", "http://127.0.0.1:9999")
|
||||
|
||||
def _build_proxy() -> Optional[Dict[str, str]]:
|
||||
"""构造代理配置字典(requests/httpx/urllib 兼容)。
|
||||
|
||||
从环境变量读取:
|
||||
- ZP_PROXY_URL: 完整代理URL,如 http://user:pass@host:port
|
||||
或组合:
|
||||
- ZP_PROXY_USERNAME, ZP_PROXY_PASSWORD, ZP_PROXY_TUNNEL
|
||||
|
||||
Returns:
|
||||
Optional[Dict[str, str]]: {'http': url, 'https': url} 或 None。
|
||||
"""
|
||||
|
||||
url ="http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818"
|
||||
return {"http": url, "https": url}
|
||||
|
||||
def _get_user_agent(mobile: bool = True) -> str:
|
||||
try:
|
||||
from fake_useragent import UserAgent
|
||||
ua = UserAgent(platforms=['mobile'] if mobile else None)
|
||||
return ua.random
|
||||
except Exception:
|
||||
if mobile:
|
||||
return "Mozilla/5.0 (Linux; Android 10; Mobile) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Mobile Safari/537.36"
|
||||
return "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"
|
||||
|
||||
def gen_page_request_id() -> str:
    """Return a page request id: fixed prefix, millisecond timestamp, random 0-999999 suffix."""
    millis = int(time.time() * 1000)
    suffix = random.randint(0, 999999)
    return "-".join(("cf1e3b3e655b4eb5a306110a83c77c29", str(millis), str(suffix)))
|
||||
|
||||
def gen_client_id() -> str:
    """Return a random UUID-v4-shaped lowercase hex string.

    The digits are derived from the current time plus ``random.random()``;
    the "y" position of the template is restricted to 8, 9, a or b.
    """
    seed = int(time.time() * 1000)
    try:
        seed += int(time.perf_counter() * 1000)
    except Exception:
        pass

    def _hex_digit(slot: str) -> str:
        nibble = int((seed + random.random() * 16) % 16)
        if slot != 'x':
            # "y" slot: force the two high bits to 10.
            nibble = (nibble & 0x3) | 0x8
        return format(nibble, "x")

    pieces = []
    for ch in "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx":
        pieces.append(_hex_digit(ch) if ch in 'xy' else ch)
    return ''.join(pieces)
|
||||
|
||||
def gen_v() -> float:
    """Return a random float in [0, 1) rounded to 8 decimal places."""
    sample = random.random()
    return round(sample, 8)
|
||||
|
||||
def build_headers_miniapp(user_agent: str) -> Dict[str, str]:
    """Build the mini-program style request headers.

    A fresh ``x-zp-rt`` (MD5 of a random UUID plus the current time) and a
    fresh upper-case UUID ``x-zp-device-id`` are generated on every call.

    Args:
        user_agent (str): Value for the User-Agent header.

    Returns:
        Dict[str, str]: Header dictionary.
    """
    rt_token = hashlib.md5(f"{uuid.uuid4()}-{time.time()}".encode("utf-8")).hexdigest()
    device_id = str(uuid.uuid4()).upper()

    headers: Dict[str, str] = {'User-Agent': user_agent}
    headers['x-zp-page-code'] = "7020"
    headers['x-zp-rt'] = rt_token
    headers['x-zp-device-id'] = device_id
    headers.update({
        'content-type': "application/json",
        'x-zp-version': "0.0.0",
        'x-zp-business-system': "73",
        'x-zp-action-id': "",
        'xweb_xhr': "1",
        'x-zp-channel': "wxxiaochengxu",
        'x-zp-platform': "12",
        'sec-fetch-site': "cross-site",
        'sec-fetch-mode': "cors",
        'sec-fetch-dest': "empty",
        'referer': "https://servicewechat.com/wxb7718fb9257e4fd2/529/page-frame.html",
        'accept-language': "zh-CN,zh;q=0.9",
    })
    return headers
|
||||
|
||||
def _get_db_path() -> str:
|
||||
"""返回默认 SQLite 数据库文件路径。"""
|
||||
base_dir = os.path.dirname(__file__)
|
||||
return os.path.join(base_dir, "zhilian_raw.sqlite3")
|
||||
|
||||
|
||||
def _init_db(db_path: str) -> None:
|
||||
"""初始化 SQLite 数据库并创建表。"""
|
||||
try:
|
||||
con = sqlite3.connect(db_path)
|
||||
cur = con.cursor()
|
||||
cur.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS responses (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
sou_full_index TEXT,
|
||||
page INTEGER,
|
||||
created_at INTEGER,
|
||||
payload TEXT
|
||||
)
|
||||
"""
|
||||
)
|
||||
cur.execute(
|
||||
"""
|
||||
CREATE TABLE IF NOT EXISTS company_details (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
number TEXT,
|
||||
created_at INTEGER,
|
||||
payload TEXT
|
||||
)
|
||||
"""
|
||||
)
|
||||
cur.execute(
|
||||
"""
|
||||
CREATE UNIQUE INDEX IF NOT EXISTS idx_responses_sou_page
|
||||
ON responses(sou_full_index, page)
|
||||
"""
|
||||
)
|
||||
cur.execute(
|
||||
"""
|
||||
CREATE UNIQUE INDEX IF NOT EXISTS idx_company_details_number
|
||||
ON company_details(number)
|
||||
"""
|
||||
)
|
||||
con.commit()
|
||||
con.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _save_search_response(db_path: str, sou_full_index: str, page: int, raw_payload: str) -> None:
|
||||
"""保存职位搜索的原始响应。"""
|
||||
try:
|
||||
con = sqlite3.connect(db_path)
|
||||
cur = con.cursor()
|
||||
cur.execute(
|
||||
"INSERT OR IGNORE INTO responses(sou_full_index, page, created_at, payload) VALUES(?, ?, ?, ?)",
|
||||
(sou_full_index, int(page), int(time.time()), raw_payload),
|
||||
)
|
||||
con.commit()
|
||||
con.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
def _save_company_detail(db_path: str, number: str, raw_payload: str) -> None:
|
||||
"""保存公司详情的原始响应。"""
|
||||
try:
|
||||
con = sqlite3.connect(db_path)
|
||||
cur = con.cursor()
|
||||
cur.execute(
|
||||
"INSERT OR IGNORE INTO company_details(number, created_at, payload) VALUES(?, ?, ?)",
|
||||
(number, int(time.time()), raw_payload),
|
||||
)
|
||||
con.commit()
|
||||
con.close()
|
||||
except Exception:
|
||||
pass
|
||||
def _has_company_detail(db_path: str, number: str) -> bool:
|
||||
"""检查公司详情是否已存在(按职位编号 number)。"""
|
||||
|
||||
try:
|
||||
con = sqlite3.connect(db_path)
|
||||
cur = con.cursor()
|
||||
cur.execute("SELECT 1 FROM company_details WHERE number=? LIMIT 1", (number,))
|
||||
row = cur.fetchone()
|
||||
con.close()
|
||||
return row is not None
|
||||
except Exception:
|
||||
return False
|
||||
def _has_page_record(db_path: str, sou_full_index: str, page: int) -> bool:
|
||||
"""检查指定关键词与页码是否已经存在。"""
|
||||
|
||||
try:
|
||||
con = sqlite3.connect(db_path)
|
||||
cur = con.cursor()
|
||||
cur.execute("SELECT 1 FROM responses WHERE sou_full_index=? AND page=? LIMIT 1", (sou_full_index, int(page)))
|
||||
row = cur.fetchone()
|
||||
con.close()
|
||||
return row is not None
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def _sleep_between_requests(min_seconds: float = 0.3, max_seconds: float = 0.8) -> None:
|
||||
"""在请求间进行随机休眠。"""
|
||||
|
||||
try:
|
||||
dur = random.uniform(min_seconds, max_seconds)
|
||||
time.sleep(dur)
|
||||
except Exception:
|
||||
time.sleep(min_seconds)
|
||||
|
||||
def _has_keyword_record(db_path: str, sou_full_index: str) -> bool:
|
||||
"""检查指定关键词是否已有任意页记录。"""
|
||||
|
||||
try:
|
||||
con = sqlite3.connect(db_path)
|
||||
cur = con.cursor()
|
||||
cur.execute("SELECT 1 FROM responses WHERE sou_full_index=? LIMIT 1", (sou_full_index,))
|
||||
row = cur.fetchone()
|
||||
con.close()
|
||||
return row is not None
|
||||
except Exception:
|
||||
return False
|
||||
def _request_json(method: str, url: str, headers: Dict[str, str], params: Optional[Dict[str, Any]] = None,
                  json_body: Optional[Dict[str, Any]] = None, timeout: int = 30, max_retries: int = 3,
                  proxies: Optional[Dict[str, str]] = None, raw_sink: Optional[Callable[[str], None]] = None) -> Optional[Dict[str, Any]]:
    """Send an HTTP request and return the decoded JSON body, retrying on failure.

    The transport is chosen per attempt: httpx when it was importable,
    otherwise requests, otherwise urllib. Every attempt starts with a random
    0.8-2.5 s sleep; a failed attempt is followed by a 1.2 * (attempt + 1) s
    sleep before the next one.

    Args:
        method: HTTP method; "GET" (case-insensitive) sends ``params``, any
            other value is sent as a POST with ``json_body`` on the httpx and
            urllib paths.
        url: Target URL.
        headers: Request headers.
        params: Query parameters.
        json_body: JSON request body.
        timeout: Timeout passed to the transport.
        max_retries: Number of attempts.
        proxies: Mapping with "http"/"https" proxy URLs.
        raw_sink: Optional callback that receives the raw response text
            before it is decoded; exceptions raised by it are ignored.

    Returns:
        The decoded JSON, or None when every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            # Random pause before every attempt, including the first one.
            time.sleep(random.uniform(0.8, 2.5))
            if httpx is not None:
                use_http2 = os.getenv("ZP_HTTP2", "1") == "1"
                debug = os.getenv("ZP_DEBUG", "0") == "1"
                kwargs: Dict[str, Any] = {"http2": use_http2, "timeout": timeout, "headers": headers, "trust_env": False}
                if proxies:
                    # httpx gets a single proxy URL; the https entry wins.
                    px = proxies.get("https") or proxies.get("http")
                    if px:
                        kwargs["proxies"] = px
                with httpx.Client(**kwargs) as client:
                    method_u = method.upper()
                    if method_u == "GET":
                        resp = client.get(url, params=params)
                    else:
                        resp = client.post(url, json=json_body)
                        if debug:
                            print({"_request_json": {"method": "POST", "status": resp.status_code}})
                        if resp.status_code == 405:
                            # POST rejected: retry as GET with the JSON body
                            # merged into the query string.
                            merged = params or {}
                            if json_body:
                                merged = {**merged, **{k: str(v) for k, v in json_body.items()}}
                            resp = client.get(url, params=merged)
                            if debug:
                                print({"_request_json": {"fallback": "GET", "status": resp.status_code}})
                    # NOTE(review): unlike the requests path below, this path
                    # does not call raise_for_status(), so error responses are
                    # also handed to raw_sink and decoded.
                    if raw_sink:
                        try:
                            raw_sink(resp.text)
                        except Exception:
                            pass
                    try:
                        return resp.json()
                    except ValueError:
                        # A second decode failure propagates to the retry handler.
                        return json.loads(resp.text)
            else:
                if requests:
                    resp = requests.request(
                        method.upper(), url,
                        headers=headers, params=params, json=json_body,
                        timeout=timeout, proxies=proxies
                    )
                    resp.raise_for_status()
                    if raw_sink:
                        try:
                            raw_sink(resp.text)
                        except Exception:
                            pass
                    return resp.json()
                # Last resort: plain urllib.
                if method.upper() == 'GET':
                    full_url = url
                    if params:
                        qs = urlencode(params)
                        full_url = f"{url}?{qs}"
                    req = Request(full_url, headers=headers, method='GET')
                else:
                    data_bytes = json.dumps(json_body or {}).encode('utf-8')
                    req = Request(url, headers=headers, data=data_bytes, method='POST')
                ctx = ssl.create_default_context()
                opener = None
                if proxies and isinstance(proxies, dict) and (proxies.get("http") or proxies.get("https")):
                    try:
                        ph = ProxyHandler(proxies)
                        opener = build_opener(ph, HTTPSHandler(context=ctx))
                    except Exception:
                        # Fall back to a direct connection when the proxy
                        # opener cannot be built.
                        opener = None
                if opener:
                    with opener.open(req, timeout=timeout) as r:
                        raw = r.read()
                else:
                    with urlopen(req, context=ctx, timeout=timeout) as r:
                        raw = r.read()
                if raw_sink:
                    try:
                        raw_sink(raw.decode("utf-8"))
                    except Exception:
                        pass
                return json.loads(raw)
        except Exception:
            # Any failure (transport, HTTP status, JSON decoding) consumes an
            # attempt; give up with None after the last one.
            if attempt == max_retries - 1:
                return None
            time.sleep(1.2 * (attempt + 1))
    return None
|
||||
|
||||
def fetch_company_desc_by_job(number: str, db_path: Optional[str] = None) -> Optional[str]:
    """Fetch the company description that belongs to a job number.

    The PC position-detail endpoint is queried first; when it yields no
    description the mini-program company-detail endpoint is tried. When
    ``db_path`` is given, raw responses are stored via ``_save_company_detail``
    and numbers that already have a stored detail are not requested again.

    Args:
        number (str): Job number.
        db_path (Optional[str]): SQLite path used for de-duplication and raw storage.

    Returns:
        Optional[str]: The description text, or None when ``number`` is not a
        non-blank string, a detail is already stored for it, or neither
        endpoint returned a non-empty description.
    """
    if not isinstance(number, str) or not number.strip():
        return None
    if db_path and _has_company_detail(db_path, number):
        return None

    def _store_raw(raw: str) -> None:
        # Shared sink for both endpoints; a no-op without a database path.
        if db_path:
            _save_company_detail(db_path, number, raw)

    # --- Attempt 1: PC web endpoint -------------------------------------
    client_id = gen_client_id()
    pc_url = "https://fe-api.zhaopin.com/c/i/jobs/position-detail-new"
    pc_params = {
        "number": number,
        "_v": gen_v(),
        "x-zp-page-request-id": gen_page_request_id(),
        "x-zp-client-id": client_id,
    }
    pc_headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        "Accept-Encoding": "identity",
        "sec-ch-ua-platform": "macOS",
        "x-zp-business-system": "1",
        "x-zp-page-code": "4019",
        "sec-ch-ua": "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"136\", \"Google Chrome\";v=\"136\"",
        "sec-ch-ua-mobile": "?0",
        "x-zp-platform": "13",
        "origin": "https://www.zhaopin.com",
        "sec-fetch-site": "same-site",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "referer": "https://www.zhaopin.com/",
        "accept-language": "zh-CN,zh;q=0.9",
        "priority": "u=1, i",
        "Cookie": f"x-zp-client-id={client_id}"
    }
    pc_data = _request_json("GET", pc_url, pc_headers, params=pc_params, proxies=_build_proxy(), raw_sink=_store_raw)
    if pc_data and isinstance(pc_data, dict):
        company = (pc_data.get("data") or {}).get("detailedCompany") or {}
        pc_desc = company.get("companyDescription")
        if isinstance(pc_desc, str) and pc_desc:
            return pc_desc

    # --- Attempt 2: mini-program endpoint -------------------------------
    mini_headers_ua = _get_user_agent(True)
    mini_url = "https://cgate.zhaopin.com/positionbusiness/exposure/companyDetail"
    mini_params = {
        "number": number,
        "platform": "12",
        "version": "0.0.0",
    }
    mini_headers = build_headers_miniapp(mini_headers_ua)
    mini_data = _request_json("GET", mini_url, mini_headers, params=mini_params, proxies=_build_proxy(), raw_sink=_store_raw)
    if mini_data and isinstance(mini_data, dict):
        mini_desc = ((mini_data.get("data") or {}).get("companyBase") or {}).get("companyDescWithHtml")
        if isinstance(mini_desc, str) and mini_desc:
            return mini_desc
    return None
|
||||
|
||||
|
||||
def build_headers(
    at: str,
    rt: str,
    device_id: str,
    channel: str = "miniapp",
    platform: str = "miniapp",
    version: str = "1.0.0",
    business_system: str = "zpfe-miniapp",
    page_code: Optional[str] = None,
    action_id: Optional[str] = None,
    user_agent: Optional[str] = None,
    referer: Optional[str] = None,
) -> Dict[str, str]:
    """Build the request headers.

    Args:
        at: Access token; sent as x-zp-at when non-empty.
        rt: Refresh/auxiliary token; sent as x-zp-rt when non-empty.
        device_id: Device identifier.
        channel: Channel identifier.
        platform: Platform identifier.
        version: Version string.
        business_system: Business system identifier.
        page_code: Page code; sent as x-zp-page-code when non-empty.
        action_id: Action identifier; sent as x-zp-action-id unless None
            (an empty string is still sent).
        user_agent: User-Agent; defaults to a WeChat mini-program UA.
        referer: Referer; defaults to the servicewechat.com page-frame URL.

    Returns:
        The header dictionary.
    """
    headers: Dict[str, str] = {
        "accept": "*/*",
        "content-type": "application/json",
        "x-zp-version": version,
        "x-zp-channel": channel,
        "x-zp-platform": platform,
        "x-zp-device-id": device_id,
        "x-zp-business-system": business_system,
        "xweb_xhr": "1",
        "sec-fetch-site": "cross-site",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "accept-language": "zh-CN,zh;q=0.9",
    }

    # (header name, value, whether to send it) in emission order.
    optional_headers = (
        ("x-zp-at", at, bool(at)),
        ("x-zp-rt", rt, bool(rt)),
        ("x-zp-page-code", page_code, bool(page_code)),
        ("x-zp-action-id", action_id, action_id is not None),
    )
    for header_name, header_value, wanted in optional_headers:
        if wanted:
            headers[header_name] = header_value

    if user_agent:
        headers["User-Agent"] = user_agent
    else:
        headers["User-Agent"] = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 "
            "MicroMessenger/6.8.0(0x16080000) NetType/WIFI MiniProgramEnv/Mac MacWechat/WMPF "
            "MacWechat/3.8.10(0x13080a10) XWEB/1227"
        )

    headers["referer"] = referer if referer else "https://servicewechat.com/wxb7718fb9257e4fd2/602/page-frame.html"
    return headers
|
||||
|
||||
|
||||
def base_url_for(path: str, env: str = "prod") -> str:
    """Pick the base domain for a relative path.

    Args:
        path: Relative path; the prefixes /capi, /api and /weex select
            dedicated hosts, everything else goes to the cgate host.
        env: Environment id; "pre" selects the pre-release host, any other
            value the production host.

    Returns:
        The base URL string (scheme and host, no trailing slash).
    """
    # (path prefix, host label, suffix appended to the label in "pre").
    routes = (
        ("/capi", "capi", "pre"),
        ("/api", "m", "-pre"),
        ("/weex", "zhibo", "-pre"),
    )
    host_label, pre_suffix = "cgate", "-pre"
    for prefix, label, suffix in routes:
        if path.startswith(prefix):
            host_label, pre_suffix = label, suffix
            break
    if env != "pre":
        pre_suffix = ""
    return f"https://{host_label}{pre_suffix}.zhaopin.com"
|
||||
|
||||
|
||||
def build_payload(
    page_index: int = 1,
    page_size: int = 10,
    city_id: Optional[int] = None,
    event_scenario: Optional[str] = None,
    sou_expand: Optional[str] = None,
    sou_full_index: Optional[str] = None,
    sort_type: Optional[str] = None,
    resume_number: Optional[str] = None,
    filter_min_salary: Optional[int] = None,
) -> Dict[str, Any]:
    """Build the request body of the job search.

    ``pageIndex`` and ``pageSize`` are always present. String options are
    added only when non-empty; ``city_id`` and ``filter_min_salary`` are
    added whenever they are not None (so 0 is kept).
    """
    optional_fields = (
        ("eventScenario", event_scenario, bool(event_scenario)),
        ("filterMinSalary", filter_min_salary, filter_min_salary is not None),
        ("S_SOU_EXPAND", sou_expand, bool(sou_expand)),
        ("S_SOU_FULL_INDEX", sou_full_index, bool(sou_full_index)),
        ("S_SOU_WORK_CITY", city_id, city_id is not None),
        ("sortType", sort_type, bool(sort_type)),
        ("resumeNumber", resume_number, bool(resume_number)),
    )
    body: Dict[str, Any] = {"pageIndex": page_index, "pageSize": page_size}
    body.update({field: value for field, value, wanted in optional_fields if wanted})
    return body
|
||||
|
||||
|
||||
def call(
    page_index: int = 1,
    page_size: int = 10,
    city_id: Optional[int] = None,
    env: str = "prod",
    timeout: float = 10.0,
    sou_full_index: Optional[str] = None,
) -> Any:
    """Run one job-search request (POST) and return the decoded response.

    Headers and body options are taken from ZP_* environment variables with
    built-in defaults. When ZP_BODY_JSON is set it replaces the generated
    body (an unparsable value becomes an empty body). A POST answered with
    405 is retried as a GET with the body as query parameters.

    Args:
        page_index: Page number to request.
        page_size: Items per page.
        city_id: Optional city filter.
        env: Environment id passed to ``base_url_for``.
        timeout: Request timeout.
        sou_full_index: Search keyword; overrides ZP_SOU_FULL_INDEX.

    Returns:
        The parsed JSON, the raw response text when it is not valid JSON, or
        ``{"error": "httpx not available"}`` when neither httpx nor requests
        could be used.
    """
    path = "/positionbusiness/searchrecommend/searchPositions"
    base = base_url_for(path, env=env)
    url = f"{base}{path}"

    # Header values: environment overrides with built-in defaults.
    at = os.getenv("ZP_AT", "")
    rt = os.getenv("ZP_RT", "")
    device_id = os.getenv("ZP_DEVICE_ID", "")
    channel = os.getenv("ZP_CHANNEL", "wxxiaochengxu")
    platform = os.getenv("ZP_PLATFORM", "12")
    version = os.getenv("ZP_VERSION", "0.0.0")
    business_system = os.getenv("ZP_BUSINESS_SYSTEM", "73")
    page_code = os.getenv("ZP_PAGE_CODE", "7019")
    action_id = os.getenv("ZP_ACTION_ID", "")
    user_agent = os.getenv("ZP_USER_AGENT")
    referer = os.getenv("ZP_REFERER")

    headers = build_headers(
        at=at,
        rt=rt,
        device_id=device_id,
        channel=channel,
        platform=platform,
        version=version,
        business_system=business_system,
        page_code=page_code,
        action_id=action_id,
        user_agent=user_agent,
        referer=referer,
    )

    # A complete body supplied through ZP_BODY_JSON wins over the generated one.
    body_env = os.getenv("ZP_BODY_JSON")
    if body_env:
        try:
            body = json.loads(body_env)
        except Exception:
            body = {}
    else:
        body = build_payload(
            page_index=page_index,
            page_size=page_size,
            city_id=city_id,
            event_scenario=os.getenv("ZP_EVENT_SCENARIO", "wxmpZhaopinSearchV2"),
            sou_expand=os.getenv("ZP_SOU_EXPAND", "SOU_COMPANY_ID"),
            sou_full_index=sou_full_index or os.getenv("ZP_SOU_FULL_INDEX"),
            sort_type=os.getenv("ZP_SORT_TYPE", "DEFAULT"),
            resume_number=os.getenv("ZP_RESUME_NUMBER"),
            filter_min_salary=int(os.getenv("ZP_FILTER_MIN_SALARY", "1")),
        )
    # An explicit keyword always ends up in the body.
    if sou_full_index:
        body["S_SOU_FULL_INDEX"] = sou_full_index

    use_http2 = os.getenv("ZP_HTTP2", "1") == "1"
    proxies = _build_proxy()
    debug = os.getenv("ZP_DEBUG", "0") == "1"
    if httpx is not None:
        kwargs: Dict[str, Any] = {"http2": use_http2, "timeout": timeout, "headers": headers, "trust_env": False}
        if proxies:
            # httpx gets a single proxy URL; the https entry wins.
            px = proxies.get("https") or proxies.get("http")
            if px:
                kwargs["proxies"] = px
        with httpx.Client(**kwargs) as client:
            resp = client.post(url, json=body)
            if debug:
                print({"method": "POST", "status": resp.status_code})
            if resp.status_code == 405:
                # POST rejected: retry as GET with the body as query string.
                params = {k: str(v) for k, v in body.items()}
                resp = client.get(url, params=params)
                if debug:
                    print({"fallback": "GET", "status": resp.status_code})
            # The raw text of the last response is exposed through the
            # ZP_LAST_RAW environment variable; failures to set it are ignored.
            try:
                os.environ["ZP_LAST_RAW"] = resp.text
            except Exception:
                pass
            try:
                return resp.json()
            except ValueError:
                return resp.text
    else:
        # Fallback transport when httpx is unavailable.
        resp = requests.post(url, json=body, headers=headers, timeout=timeout, proxies=proxies) if requests is not None else None
        if resp is not None:
            if debug:
                try:
                    print({"method": "POST", "status": resp.status_code})
                except Exception:
                    pass
            if getattr(resp, "status_code", None) == 405:
                params = {k: str(v) for k, v in body.items()}
                resp = requests.get(url, params=params, headers=headers, timeout=timeout, proxies=proxies)
                if debug:
                    try:
                        print({"fallback": "GET", "status": resp.status_code})
                    except Exception:
                        pass
            try:
                return resp.json()
            except ValueError:
                return resp.text
    # Reached only when neither httpx nor requests is importable.
    return {"error": "httpx not available"}
|
||||
|
||||
|
||||
def _load_lines() -> list:
|
||||
base_dir = os.path.dirname(__file__)
|
||||
candidates = [
|
||||
os.path.join(base_dir, "company.txt"),
|
||||
os.path.join(base_dir, "conpany.txt"),
|
||||
]
|
||||
for fp in candidates:
|
||||
if os.path.exists(fp):
|
||||
try:
|
||||
with open(fp, "r", encoding="utf-8") as f:
|
||||
lines = [ln.strip() for ln in f.readlines()]
|
||||
return [ln for ln in lines if ln]
|
||||
except Exception:
|
||||
continue
|
||||
return []
|
||||
|
||||
|
||||
def _record_success(name: str) -> None:
    """Append a keyword to the success log unless it was already written.

    The name is stripped first; empty names and names already present in
    ``_SUCCESS_WRITTEN`` are skipped. Any exception is ignored, so this never
    interrupts the caller.

    Args:
        name: Keyword to record.
    """
    try:
        keyword = (name or "").strip()
        if keyword and keyword not in _SUCCESS_WRITTEN:
            with open(SUCCESS_LOG_PATH, "a", encoding="utf-8") as log_file:
                log_file.write(f"{keyword}\n")
            _SUCCESS_WRITTEN.add(keyword)
    except Exception:
        # Best effort only: a failed bookkeeping write must not stop the crawl.
        pass
|
||||
|
||||
def _extract_job_items_from_result(result: Any) -> list:
    """Return ``result["data"]["list"]`` when it is a list, otherwise ``[]``.

    Args:
        result: Value returned by the search call; anything that is not a dict
            yields an empty list.

    Returns:
        list: The job items found in the result, or an empty list.
    """
    try:
        payload = result.get("data") if isinstance(result, dict) else None
        if not isinstance(payload, dict):
            return []
        jobs = payload.get("list")
        if isinstance(jobs, list):
            return jobs
        return []
    except Exception:
        return []
|
||||
|
||||
import requests
|
||||
|
||||
def _report_universal(items: list, data_type: str = "job") -> bool:
    """Disabled reporting hook: performs no request and always returns False.

    The signature is kept so existing call sites keep working, but the body
    does not send anything anywhere; both arguments are ignored.

    Args:
        items (list): Data list that would be reported (unused).
        data_type (str): Logical data type label (unused).

    Returns:
        bool: Always False.
    """
    return False
|
||||
|
||||
|
||||
|
||||
|
||||
def main() -> None:
    """Crawl search results for every keyword in the keyword file.

    Reads ``ZP_ENV``, ``ZP_PAGE_INDEX``, ``ZP_PAGE_SIZE`` and
    ``ZP_DEMO_CITY_ID`` from the environment. With keywords available, each
    keyword is paged through ``call`` and every page is stored via
    ``_save_search_response``. Without keywords a single plain ``call`` is
    made, and a non-dict result is printed (first 200 characters).
    """
    env = os.getenv("ZP_ENV", "prod")
    page_index = int(os.getenv("ZP_PAGE_INDEX", "1"))
    page_size = int(os.getenv("ZP_PAGE_SIZE", "15"))
    city_env = os.getenv("ZP_DEMO_CITY_ID")
    city_id = int(city_env) if city_env and city_env.isdigit() else None

    keywords = _load_lines()
    if not keywords:
        result = call(page_index=page_index, page_size=page_size, city_id=city_id, env=env)
        if not isinstance(result, dict):
            print(str(result)[:200])
        return

    try:
        from tqdm import tqdm
    except Exception:
        # Progress bars are optional; run without them when tqdm is unusable.
        tqdm = None
    seq = tqdm(keywords, desc="S_SOU_FULL_INDEX") if tqdm else keywords
    db_path = _get_db_path()
    _init_db(db_path)
    for keyword in seq:
        total_items = _crawl_keyword(
            keyword,
            db_path=db_path,
            first_page=page_index,
            page_size=page_size,
            city_id=city_id,
            env=env,
            tqdm_cls=tqdm,
        )
        # Skipped keywords (None) leave the outer bar's postfix untouched.
        if tqdm and total_items is not None:
            seq.set_postfix({"total": total_items})


def _crawl_keyword(keyword: str, *, db_path: Any, first_page: int, page_size: int,
                   city_id: Any, env: str, tqdm_cls: Any) -> "int | None":
    """Page through one keyword; return the item total, or None when skipped."""
    # NOTE: a tqdm bar created without iterable/total raises TypeError when
    # truth-tested, so the bar is always compared against None explicitly.
    pages_bar = tqdm_cls(desc=f"{keyword}", leave=False) if tqdm_cls else None
    try:
        # A keyword that already has any stored page is skipped entirely to
        # avoid repeating requests.
        if _has_keyword_record(db_path, keyword):
            if pages_bar is not None:
                pages_bar.update(0)
                pages_bar.set_postfix({"keyword": keyword, "skipped": True})
            return None

        total_items = 0
        page = first_page
        while True:
            if _has_page_record(db_path, keyword, page):
                if pages_bar is not None:
                    pages_bar.update(1)
                    pages_bar.set_postfix({"page": page, "skipped": True})
                _sleep_between_requests()
                page += 1
                continue

            result = call(
                page_index=page,
                page_size=page_size,
                city_id=city_id,
                env=env,
                sou_full_index=keyword,
            )
            _store_page(db_path, keyword, page, result)

            data = result.get("data") if isinstance(result, dict) else None
            jobs = data.get("list") if isinstance(data, dict) else None
            if not isinstance(jobs, list):
                jobs = []
            is_end = isinstance(data, dict) and str(data.get("isEndPage", "")).strip() in ("1", "true", "True")

            # NOTE(review): the page was already serialised by _store_page, so
            # these in-place field additions are not persisted here — confirm
            # whether a consumer is still expected.
            for job in jobs:
                if isinstance(job, dict):
                    _normalize_job_item(job, keyword, db_path)

            cur_items = len(jobs)
            total_items += cur_items
            if pages_bar is not None:
                pages_bar.update(1)
                pages_bar.set_postfix({"page": page, "items": cur_items})

            # Record the keyword once, on the first page that returned items.
            if cur_items > 0 and total_items == cur_items:
                _record_success(keyword)
            # Stop on an empty page or when the server marks the last page.
            if cur_items == 0 or is_end:
                break
            page += 1
            _sleep_between_requests()
        return total_items
    finally:
        if pages_bar is not None:
            pages_bar.close()


def _store_page(db_path: Any, keyword: str, page: int, result: Any) -> None:
    """Persist one search page via ``_save_search_response`` (best effort)."""
    try:
        raw = os.getenv("ZP_LAST_RAW", "")
        page_items = _extract_job_items_from_result(result)
        payload_obj = {
            "keyword": keyword,
            "page": page,
            "count": len(page_items),
            "items": page_items,
            "data": result.get("data") if isinstance(result, dict) else None,
            "raw": raw or (json.dumps(result, ensure_ascii=False) if isinstance(result, dict) else str(result)),
        }
        _save_search_response(db_path, keyword, page, json.dumps(payload_obj, ensure_ascii=False))
    except Exception:
        # A storage failure must not abort the crawl of the remaining pages.
        pass


def _safe_dig(obj: Any, *keys: str, default: Any = "") -> Any:
    """Follow ``keys`` through nested dicts; return ``default`` if the path breaks."""
    cur = obj
    for key in keys:
        if not isinstance(cur, dict) or key not in cur:
            return default
        cur = cur[key]
    return cur


def _parse_card_json(item: dict) -> dict:
    """Decode the item's ``cardCustomJson`` string into a dict (``{}`` on failure)."""
    raw_card = item.get("cardCustomJson")
    if not isinstance(raw_card, str):
        return {}
    try:
        card = json.loads(raw_card)
    except ValueError:
        return {}
    return card if isinstance(card, dict) else {}


def _collect_skill_tags(item: dict) -> list:
    """Collect tag strings from ``showSkillTags`` and ``skillLabel``, in that order."""
    tags = []
    for field_name, label_keys in (
        ("showSkillTags", ("tag", "value", "name")),
        ("skillLabel", ("value", "name")),
    ):
        for entry in item.get(field_name) or []:
            if isinstance(entry, str):
                tags.append(entry)
            elif isinstance(entry, dict):
                label = next((entry[k] for k in label_keys if entry.get(k)), None)
                if label:
                    tags.append(str(label))
    return tags


def _split_salary_range(raw_salary: Any) -> tuple:
    """Split a ``"low-high"`` salary string into ``(min, max)`` strings.

    Unit suffixes are removed first. Anything that is not a two-part range
    yields ``("", "")``; non-numeric parts are returned stripped and unordered.
    """
    if not isinstance(raw_salary, str):
        return "", ""
    cleaned = raw_salary
    for unit in ("元", "/月", "/天", "/年"):
        cleaned = cleaned.replace(unit, "")
    bounds = [part for part in cleaned.split("-") if part.strip()]
    if len(bounds) != 2:
        return "", ""
    try:
        low, high = sorted((int(bounds[0]), int(bounds[1])))
    except ValueError:
        return bounds[0].strip(), bounds[1].strip()
    return str(low), str(high)


def _strip_url(value: Any) -> str:
    """Return ``value`` as text without surrounding whitespace or backticks."""
    text = str(value or "").strip()
    return text.strip("`").strip()


def _normalize_job_item(item: dict, keyword: str, db_path: Any) -> None:
    """Add the flattened export fields to one raw job dict, in place."""
    item["companyName"] = keyword or item.get("companyName", "")
    item["jobName"] = (
        item.get("jobName", "")
        or item.get("name", "")
        or _safe_dig(item, "position", "base", "positionName")
    )
    item["jobDescribe"] = item.get("jobSummary", "") or _safe_dig(item, "position", "desc", "description")
    item["degreeString"] = item.get("education", "") or _safe_dig(item, "position", "base", "education")
    item["jobTagsForOrder"] = _collect_skill_tags(item)

    # Experience / education
    item["workYearString"] = item.get("workingExp", "") or _safe_dig(item, "position", "base", "positionWorkingExp")
    item["jobExperience"] = item.get("jobExperience", "")
    item["jobEducation"] = item.get("jobEducation", "")

    # Work type
    item["termStr"] = item.get("workType", "") or _safe_dig(item, "position", "base", "workType")

    # Location: the address inside cardCustomJson is the fallback.
    card = _parse_card_json(item)
    address = _safe_dig(item, "workLocation", "workAddress", default=None) or card.get("address")
    item["location"] = address or ""
    area_parts = (item.get("workCity"), item.get("cityDistrict"), item.get("streetName"))
    # Empty/None parts are dropped so a null field never renders as "None".
    item["jobAreaString"] = "".join(str(part) for part in area_parts if part).strip()

    # Publish time
    item["confirmDateString"] = item.get("publishTime", "") or item.get("firstPublishTime", "")

    # Company size / ownership type
    item["companySizeString"] = item.get("companySize", "")
    item["companyTypeString"] = item.get("propertyName", "")

    # Industry
    item["major1Str"] = item.get("industryName", "")
    item["major2Str"] = ""

    # Links / ids / company info
    job_url = (
        item.get("positionUrl")
        or item.get("positionURL")
        or _safe_dig(item, "position", "base", "positionUrl", default=None)
    )
    item["jobHref"] = _strip_url(job_url)
    item["companyHref"] = _strip_url(item.get("companyUrl"))
    item["coId"] = item.get("companyId")
    item["fullCompanyName"] = item.get("companyName", "")

    # Salary: salaryReal first, then salary, then cardCustomJson.salary60.
    raw_salary = item.get("salaryReal") or item.get("salary") or card.get("salary60")
    item["jobSalaryMin"], item["jobSalaryMax"] = _split_salary_range(raw_salary)

    number = item.get("number")
    if isinstance(number, str) and number:
        try:
            desc_html = fetch_company_desc_by_job(number, db_path=db_path)
        except Exception:
            # The company description is optional enrichment.
            desc_html = None
        if isinstance(desc_html, str) and desc_html:
            item["companyDesc"] = desc_html
            item["company_desc"] = desc_html
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # SECURITY(review): proxy credentials are committed in source. They are
    # kept only as fallbacks so the script still runs unchanged; values already
    # present in the environment now take precedence. Rotate these credentials
    # and supply them through the environment or a secret store instead.
    os.environ.setdefault('ZP_PROXY_USERNAME', 't13319619426654')
    os.environ.setdefault('ZP_PROXY_PASSWORD', 'ln8aj9nl')
    os.environ.setdefault('ZP_PROXY_TUNNEL', 's432.kdltps.com:15818')
    main()
|
||||
File diff suppressed because one or more lines are too long
@ -1,782 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
import json
|
||||
import time
|
||||
import random
|
||||
import uuid
|
||||
import hashlib
|
||||
from typing import Dict, Any, List, Optional, Tuple
|
||||
|
||||
import requests
|
||||
import os
|
||||
from loguru import logger
|
||||
from urllib.parse import quote
|
||||
import socket
|
||||
|
||||
|
||||
def sleep_random_between() -> float:
    """Sleep for a random interval and return the number of seconds slept.

    The bounds come from ``SLEEP_MIN_SECONDS`` (default 1) and
    ``SLEEP_MAX_SECONDS`` (default 10); a maximum below the minimum is raised
    to the minimum. Unparseable or non-finite values fall back to 1 second and
    negative results are clamped to 0, so ``time.sleep`` is never handed a
    value it rejects.

    Returns:
        float: The interval that was slept, in seconds.
    """
    try:
        min_seconds = float(os.getenv("SLEEP_MIN_SECONDS", "1"))
        max_seconds = float(os.getenv("SLEEP_MAX_SECONDS", "10"))
        if max_seconds < min_seconds:
            max_seconds = min_seconds
        wait_time = random.uniform(min_seconds, max_seconds)
    except (TypeError, ValueError):
        wait_time = 1.0
    # NaN compares unequal to itself; infinity would overflow time.sleep.
    if wait_time != wait_time or wait_time in (float("inf"), float("-inf")):
        wait_time = 1.0
    wait_time = max(0.0, wait_time)
    time.sleep(wait_time)
    return wait_time
|
||||
|
||||
|
||||
# Fixed run configuration: edit the values below (or set the environment
# overrides named next to them) before running.
CITY_ID = 801
PAGE_SIZE = 15
MAX_PAGES = 15
# SECURITY(review): the literals below are committed credentials. They remain
# only as fallbacks; prefer supplying ZP_PROXY_USERNAME / ZP_PROXY_PASSWORD /
# ZP_PROXY_TUNNEL through the environment, and rotate the committed values.
proxy_config = {
    "username": os.getenv("ZP_PROXY_USERNAME", "t13319619426654"),
    "password": os.getenv("ZP_PROXY_PASSWORD", "ln8aj9nl"),
    "tunnel": os.getenv("ZP_PROXY_TUNNEL", "s432.kdltps.com:15818"),
}
# Percent-encode the credentials so characters such as "@" or ":" cannot
# corrupt the proxy URL.
PROXY = (
    f"http://{quote(proxy_config['username'], safe='')}:"
    f"{quote(proxy_config['password'], safe='')}@{proxy_config['tunnel']}"
)
DEDUP = True
API_BASE_URL = os.getenv('API_BASE_URL', 'http://124.222.106.226:9999')

API_PUBLIC_HOST = os.getenv("API_PUBLIC_HOST")
# Import-time side effects: create the log directory and attach a file sink
# that rotates at midnight and keeps 30 days of logs.
os.makedirs("logs", exist_ok=True)
logger.add("logs/log_{time:YYYY-MM-DD}.log", level="INFO", rotation="00:00", retention="30 days", enqueue=True)
|
||||
|
||||
|
||||
def log(*args: Any) -> None:
    """Log the arguments at INFO level, prefixed with a local timestamp.

    Args:
        *args: Values to log; each is converted with ``str`` and the results
            are joined with single spaces.

    Returns:
        None
    """
    stamp = time.strftime("%Y-%m-%d %H:%M:%S")
    message = " ".join(map(str, args))
    logger.info("{} {}", stamp, message)
|
||||
|
||||
|
||||
class ZhilianAPI:
    """Wrapper around the Zhilian (zhaopin.com) endpoints used by the crawler.

    Attributes:
        session: ``requests.Session`` used for all crawl requests.
        proxies: ``{"http": ..., "https": ...}`` proxy mapping, or ``None``
            when no proxy was given.
    """

    def __init__(self, proxy: Optional[str] = None) -> None:
        """Create the session and, when given, route it through ``proxy``.

        Args:
            proxy: Proxy URL applied to both http and https traffic.

        Returns:
            None
        """
        self.session = requests.Session()
        self.proxies = None
        if proxy:
            self.proxies = {"http": proxy, "https": proxy}
            self.session.proxies.update(self.proxies)

    def request_json(self, method: str, url: str, headers: Dict[str, str], params: Optional[Dict[str, Any]] = None,
                     json_body: Optional[Dict[str, Any]] = None, timeout: int = 30, max_retries: int = 3,
                     delay_range: Tuple[float, float] = (1.0, 3.0)) -> Optional[Dict[str, Any]]:
        """Send a request through the session and return the decoded JSON body.

        Every attempt is preceded by ``sleep_random_between()``. After a failed
        attempt the method waits ``1.5 * attempt_number`` seconds before
        retrying.

        Args:
            method: HTTP method name.
            url: Request URL.
            headers: Request headers.
            params: Query-string parameters.
            json_body: JSON request body.
            timeout: Timeout in seconds.
            max_retries: Maximum number of attempts.
            delay_range: Accepted for interface compatibility; currently
                unused (the delay comes from ``sleep_random_between``).

        Returns:
            dict|None: The JSON response, or None when every attempt failed.
        """
        for attempt in range(max_retries):
            try:
                sleep_random_between()
                resp = self.session.request(method.upper(), url, headers=headers, params=params, json=json_body,
                                            timeout=timeout)
                resp.raise_for_status()
                data = resp.json()
                logger.info("请求参数 method={} url={} status={} params={} body={} resp_size={}", method.upper(), url, resp.status_code, params or {}, json_body or {}, len(resp.content))
                logger.info("原始数据 {}", json.dumps(data, ensure_ascii=False))
                return data
            except Exception as exc:
                # Still best effort (None after the last attempt), but the
                # failure is now visible in the log.
                logger.warning("请求失败 method={} url={} attempt={}/{} error={!r}", method.upper(), url, attempt + 1, max_retries, exc)
                if attempt == max_retries - 1:
                    return None
                time.sleep(1.5 * (attempt + 1))
        return None

    @staticmethod
    def _detail_headers(client_id: str) -> Dict[str, str]:
        """Build the browser-like headers for the PC position-detail endpoint."""
        return {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
            "Accept": "application/json, text/plain, */*",
            "Accept-Encoding": "identity",
            "sec-ch-ua-platform": "macOS",
            "x-zp-business-system": "1",
            "x-zp-page-code": "4019",
            "sec-ch-ua": "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"136\", \"Google Chrome\";v=\"136\"",
            "sec-ch-ua-mobile": "?0",
            "x-zp-platform": "13",
            "origin": "https://www.zhaopin.com",
            "sec-fetch-site": "same-site",
            "sec-fetch-mode": "cors",
            "sec-fetch-dest": "empty",
            "referer": "https://www.zhaopin.com/",
            "accept-language": "zh-CN,zh;q=0.9",
            "priority": "u=1, i",
            'Cookie': f"x-zp-client-id={client_id}",
        }

    def fetch_company_desc_by_job(self, number: str) -> Optional[str]:
        """Fetch the company description belonging to a job number.

        The PC position-detail endpoint is tried first; when it yields no
        description, the mini-program company-detail endpoint is queried.

        Args:
            number: Job number.

        Returns:
            str|None: Company description HTML, or None when neither endpoint
            returned one.
        """
        client_id = gen_client_id()
        url_pc = "https://fe-api.zhaopin.com/c/i/jobs/position-detail-new"
        params_pc = {
            "number": number,
            "_v": gen_v(),
            "x-zp-page-request-id": gen_page_request_id(),
            "x-zp-client-id": client_id,
        }
        data_pc = self.request_json("GET", url_pc, self._detail_headers(client_id), params=params_pc)
        if data_pc and isinstance(data_pc, dict):
            detail = data_pc.get("data")
            comp = detail.get("detailedCompany") if isinstance(detail, dict) else None
            desc_pc = comp.get("companyDescription") if isinstance(comp, dict) else None
            if isinstance(desc_pc, str) and desc_pc:
                return desc_pc

        ua = _get_user_agent(True)
        url_mini = "https://cgate.zhaopin.com/positionbusiness/exposure/companyDetail"
        params_mini = {
            "number": number,
            "platform": "12",
            "version": "0.0.0",
        }
        headers_mini = build_headers_miniapp(ua)
        data_mini = self.request_json("GET", url_mini, headers_mini, params=params_mini)
        if data_mini and isinstance(data_mini, dict):
            body = data_mini.get("data")
            base = body.get("companyBase") if isinstance(body, dict) else None
            desc_mini = base.get("companyDescWithHtml") if isinstance(base, dict) else None
            if isinstance(desc_mini, str) and desc_mini:
                return desc_mini
        return None

    def crawl_pc(self, city_id: int, page_size: int, max_pages: int, dedup: bool,
                 job_level3_code: Optional[str] = None) -> List[Dict[str, Any]]:
        """Crawl job postings for a city through the PC search endpoint.

        Pages are requested from 1 to ``max_pages``; crawling stops early on a
        failed request, a response whose ``code`` is not 200, or an empty
        page. Each job with a ``number`` gets a ``companyDesc`` field, and
        every non-empty page is reported through ``report_data``.

        Args:
            city_id: City id.
            page_size: Items per page.
            max_pages: Maximum number of pages.
            dedup: Whether to drop jobs whose ``jobId`` was already seen.
            job_level3_code: Optional level-3 job category code.

        Returns:
            list: All jobs collected across the crawled pages.
        """
        headers = build_headers_pc()
        base_url = "https://fe-api.zhaopin.com/c/i/search/positions"
        seen = set()
        items: List[Dict[str, Any]] = []
        for page in range(1, max_pages + 1):
            log("开始抓取PC职位页", {"city_id": city_id, "page": page, "page_size": page_size, "job_level3": job_level3_code or ""})
            params = {
                "_v": gen_v(),
                "x-zp-page-request-id": gen_page_request_id(),
                "x-zp-client-id": gen_client_id(),
            }
            payload = {
                "S_SOU_WORK_CITY": city_id,
                "order": 4,
                "pageSize": page_size,
                "pageIndex": page,
                "eventScenario": "pcSearchedSouSearch",
                "anonymous": 1,
                "platform": 13,
                "version": "0.0.0",
            }
            if job_level3_code:
                payload["S_SOU_JD_JOB_LEVEL3"] = job_level3_code
            data = self.request_json("POST", base_url, headers, params=params, json_body=payload)
            resp_code = data.get("code") if isinstance(data, dict) else None
            if resp_code != 200:
                log("抓取失败或返回非200", {"page": page, "resp_code": resp_code})
                break
            # "data" may be null or a non-dict; treat that as an empty page.
            body = data.get("data")
            lst = body.get("list") if isinstance(body, dict) else None
            if not lst:
                log("该页无职位数据", {"page": page})
                break
            page_items: List[Dict[str, Any]] = []
            for job in lst:
                if not isinstance(job, dict):
                    continue
                jid = job.get("jobId")
                if dedup and jid in seen:
                    continue
                if dedup and jid:
                    seen.add(jid)
                num = job.get("number")
                if num:
                    job["companyDesc"] = self.fetch_company_desc_by_job(str(num)) or ""
                items.append(job)
                page_items.append(job)
            log("该页职位数", {"page": page, "count": len(page_items)})
            if page_items:
                self.report_data(page_items, "job", "zhilian")
        log("PC抓取完成", {"total": len(items)})
        return items

    def get_local_ip(self) -> str:
        """Return the local IP address, or ``"127.0.0.1"`` when it cannot be determined."""
        try:
            # The context manager closes the socket even when connect() fails.
            with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
                sock.connect(("8.8.8.8", 80))
                return sock.getsockname()[0]
        except Exception:
            return "127.0.0.1"

    def report_data(self, data_list: List[Dict[str, Any]], data_type: str, platform: str = "zhilian") -> bool:
        """Report a batch of records to the remote API.

        Args:
            data_list: Records to report.
            data_type: Data type label.
            platform: Platform identifier.

        Returns:
            bool: True when the API answered with a 2xx status, else False.
        """
        try:
            universal_data = {
                "data_list": data_list,
                "data_type": data_type,
                "platform": platform
            }
            headers = {
                "accept": "application/json",
                "Content-Type": "application/json",
                'X-Forwarded-For': self.get_local_ip()
            }
            if API_PUBLIC_HOST:
                headers["Host"] = API_PUBLIC_HOST
                headers["X-Forwarded-Host"] = API_PUBLIC_HOST
            api_endpoint = f"{API_BASE_URL}/api/v1/universal/data/batch-store-async"
            logger.info("REPORT_DATA {}", json.dumps(universal_data, ensure_ascii=False))
            resp = requests.post(api_endpoint, json=universal_data, headers=headers, timeout=300)
            ok = 200 <= resp.status_code < 300
            log("数据上报完成", {"count": len(data_list), "status_code": resp.status_code, "ok": ok})
            return ok
        except Exception as exc:
            # Reporting stays best effort, but the reason is now logged.
            logger.warning("数据上报失败 count={} error={!r}", len(data_list), exc)
            return False
|
||||
|
||||
|
||||
def _get_user_agent(mobile: bool = True) -> str:
    """Return a random User-Agent string, with fixed fallbacks.

    ``fake_useragent`` is used when it can be imported and queried; on any
    failure a hard-coded Chrome User-Agent is returned instead.

    Args:
        mobile: Whether to request a mobile User-Agent.

    Returns:
        str: A User-Agent string.
    """
    fallback_mobile = "Mozilla/5.0 (Linux; Android 10; Mobile) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Mobile Safari/537.36"
    fallback_desktop = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36"
    try:
        from fake_useragent import UserAgent
        platforms = ['mobile'] if mobile else None
        return UserAgent(platforms=platforms).random
    except Exception:
        return fallback_mobile if mobile else fallback_desktop
|
||||
|
||||
|
||||
def generate_xzp_rt() -> str:
    """Generate a value for the ``x-zp-rt`` header.

    Returns:
        str: Hex MD5 digest of a fresh UUID4 joined with the current time.
    """
    seed = "{}-{}".format(uuid.uuid4(), time.time())
    digest = hashlib.md5(seed.encode("utf-8"))
    return digest.hexdigest()
|
||||
|
||||
|
||||
def random_device_id() -> str:
    """Generate a random device id.

    Returns:
        str: An upper-cased UUID4 string.
    """
    device_uuid = uuid.uuid4()
    return f"{device_uuid}".upper()
|
||||
|
||||
|
||||
def gen_page_request_id() -> str:
    """Generate a page request id.

    Returns:
        str: A fixed hex prefix, the current time in milliseconds and a random
        integer in [0, 999999], joined with hyphens.
    """
    millis = int(time.time() * 1000)
    suffix = random.randint(0, 999999)
    return "-".join(("cf1e3b3e655b4eb5a306110a83c77c29", str(millis), str(suffix)))
|
||||
|
||||
|
||||
def gen_client_id() -> str:
    """Generate a client id shaped like a version-4 UUID.

    Every ``x`` of the template becomes a random hex digit and the ``y``
    becomes one of ``8``, ``9``, ``a`` or ``b``.

    Returns:
        str: The generated client id.
    """
    seed = int(time.time() * 1000)
    try:
        seed += int(time.perf_counter() * 1000)
    except Exception:
        pass

    out = []
    for ch in "xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx":
        if ch not in "xy":
            # Hyphens and the literal version digit are copied through.
            out.append(ch)
            continue
        nibble = int((seed + random.random() * 16) % 16)
        if ch == "y":
            nibble = (nibble & 0x3) | 0x8
        out.append(format(nibble, "x"))
    return "".join(out)
|
||||
|
||||
|
||||
def gen_v() -> float:
    """Generate the ``_v`` query parameter.

    Returns:
        float: A random number from ``random.random()`` rounded to 8 decimals.
    """
    sample = random.random()
    return round(sample, 8)
|
||||
|
||||
|
||||
def build_headers_miniapp(user_agent: str) -> Dict[str, str]:
    """Build the request headers shared by the mini-program endpoints.

    Args:
        user_agent: User-Agent string to send.

    Returns:
        dict: Headers including a fresh ``x-zp-rt`` and ``x-zp-device-id``.
    """
    headers = {'User-Agent': user_agent, 'x-zp-page-code': "7020"}
    # Per-call values are inserted first so the key order stays unchanged.
    headers['x-zp-rt'] = generate_xzp_rt()
    headers['x-zp-device-id'] = random_device_id()
    headers.update({
        'content-type': "application/json",
        'x-zp-version': "0.0.0",
        'x-zp-business-system': "73",
        'x-zp-action-id': "",
        'xweb_xhr': "1",
        'x-zp-channel': "wxxiaochengxu",
        'x-zp-platform': "12",
        'sec-fetch-site': "cross-site",
        'sec-fetch-mode': "cors",
        'sec-fetch-dest': "empty",
        'referer': "https://servicewechat.com/wxb7718fb9257e4fd2/529/page-frame.html",
        'accept-language': "zh-CN,zh;q=0.9",
    })
    return headers
|
||||
|
||||
|
||||
def build_headers_pc() -> Dict[str, str]:
    """Build the request headers shared by the PC endpoints.

    Returns:
        dict: A new header dictionary on every call.
    """
    pairs = (
        ("accept", "application/json, text/plain, */*"),
        ("accept-language", "zh-CN,zh;q=0.9"),
        ("content-type", "application/json;charset=UTF-8"),
        ("origin", "https://www.zhaopin.com"),
        ("priority", "u=1, i"),
        ("referer", "https://www.zhaopin.com/"),
        ("sec-ch-ua-mobile", "?0"),
        ("sec-fetch-dest", "empty"),
        ("sec-fetch-mode", "cors"),
        ("sec-fetch-site", "same-site"),
        ("x-zp-page-code", "0"),
    )
    return dict(pairs)
|
||||
|
||||
|
||||
def request_json(method: str, url: str, headers: Dict[str, str], params: Optional[Dict[str, Any]] = None,
                 json_body: Optional[Dict[str, Any]] = None, proxies: Optional[str] = None,
                 timeout: int = 30, max_retries: int = 3, delay_range: Tuple[float, float] = (1.0, 3.0)) -> Optional[Dict[str, Any]]:
    """Send a request and return the decoded JSON body.

    Every attempt is preceded by ``sleep_random_between()``. After a failed
    attempt the function waits ``1.5 * attempt_number`` seconds before
    retrying.

    Args:
        method: HTTP method name.
        url: Request URL.
        headers: Request headers.
        params: Query-string parameters.
        json_body: JSON request body.
        proxies: Proxy URL string, e.g. "http://127.0.0.1:7890".
        timeout: Timeout in seconds.
        max_retries: Maximum number of attempts.
        delay_range: Accepted for interface compatibility; currently unused
            (the delay comes from ``sleep_random_between``).

    Returns:
        dict|None: The JSON response, or None when every attempt failed.
    """
    proxy_dict = None
    if proxies:
        proxy_dict = {"http": proxies, "https": proxies}
        if "@" in proxies:
            # Split on the LAST "@" so a password containing "@" can never end
            # up in the log; only the host:port part is recorded.
            logger.info("USE_PROXY_TUNNEL {}", proxies.rsplit("@", 1)[-1])
        else:
            logger.info("USE_PROXY_ENABLED")
    for attempt in range(max_retries):
        try:
            sleep_random_between()
            resp = requests.request(method.upper(), url, headers=headers, params=params, json=json_body,
                                    timeout=timeout, proxies=proxy_dict)
            resp.raise_for_status()
            data = resp.json()
            logger.info("请求参数 method={} url={} status={} params={} body={} resp_size={}", method.upper(), url, resp.status_code, params or {}, json_body or {}, len(resp.content))
            logger.info("原始数据 {}", json.dumps(data, ensure_ascii=False))
            return data
        except Exception as exc:
            # Still best effort (None after the last attempt), but the failure
            # is now visible in the log.
            logger.warning("请求失败 method={} url={} attempt={}/{} error={!r}", method.upper(), url, attempt + 1, max_retries, exc)
            if attempt == max_retries - 1:
                return None
            time.sleep(1.5 * (attempt + 1))
    return None
|
||||
|
||||
|
||||
def fetch_company_desc_by_job(number: str, proxies: Optional[str] = None) -> Optional[str]:
    """Fetch the company description belonging to a job number.

    The PC position-detail endpoint is tried first; when it yields no
    description, the mini-program company-detail endpoint is queried.

    Args:
        number: Job number.
        proxies: Optional proxy URL passed on to ``request_json``.

    Returns:
        str|None: Company description HTML, or None when neither endpoint
        returned one.
    """
    client_id = gen_client_id()
    url_pc = "https://fe-api.zhaopin.com/c/i/jobs/position-detail-new"
    params_pc = {
        "number": number,
        "_v": gen_v(),
        "x-zp-page-request-id": gen_page_request_id(),
        "x-zp-client-id": client_id,
    }
    headers_pc = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36",
        "Accept": "application/json, text/plain, */*",
        # "identity" (as ZhilianAPI.fetch_company_desc_by_job sends) instead of
        # advertising br/zstd, which the HTTP client can only decode when
        # optional packages are installed.
        "Accept-Encoding": "identity",
        "sec-ch-ua-platform": "macOS",
        "x-zp-business-system": "1",
        "x-zp-page-code": "4019",
        "sec-ch-ua": "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"136\", \"Google Chrome\";v=\"136\"",
        "sec-ch-ua-mobile": "?0",
        "x-zp-platform": "13",
        "origin": "https://www.zhaopin.com",
        "sec-fetch-site": "same-site",
        "sec-fetch-mode": "cors",
        "sec-fetch-dest": "empty",
        "referer": "https://www.zhaopin.com/",
        "accept-language": "zh-CN,zh;q=0.9",
        "priority": "u=1, i",
        "Cookie": f"x-zp-client-id={client_id}",
    }
    data_pc = request_json("GET", url_pc, headers_pc, params=params_pc, proxies=proxies)
    if data_pc and isinstance(data_pc, dict):
        detail = data_pc.get("data")
        comp = detail.get("detailedCompany") if isinstance(detail, dict) else None
        desc_pc = comp.get("companyDescription") if isinstance(comp, dict) else None
        if isinstance(desc_pc, str) and desc_pc:
            return desc_pc

    ua = _get_user_agent(True)
    url_mini = "https://cgate.zhaopin.com/positionbusiness/exposure/companyDetail"
    params_mini = {
        "number": number,
        "platform": "12",
        "version": "0.0.0",
    }
    headers_mini = build_headers_miniapp(ua)
    data_mini = request_json("GET", url_mini, headers_mini, params=params_mini, proxies=proxies)
    if data_mini and isinstance(data_mini, dict):
        body = data_mini.get("data")
        base = body.get("companyBase") if isinstance(body, dict) else None
        desc_mini = base.get("companyDescWithHtml") if isinstance(base, dict) else None
        if isinstance(desc_mini, str) and desc_mini:
            return desc_mini
    return None
|
||||
|
||||
|
||||
def load_work_data(path: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """Load and parse the ``work.json`` data file.

    Args:
        path: File to read; defaults to ``work.json`` next to this script.

    Returns:
        dict|None: The parsed JSON, or None when reading or parsing fails.
    """
    try:
        target = path or os.path.join(os.path.dirname(__file__), "work.json")
        with open(target, "r", encoding="utf-8") as handle:
            return json.loads(handle.read())
    except Exception:
        return None
|
||||
|
||||
|
||||
|
||||
def pick_random_city(work: Dict[str, Any]) -> Optional[Tuple[int, str]]:
    """Pick a random city from the parsed ``work.json`` data.

    The lists under ``data.cities``, ``data.city``, ``data.workCity`` and
    ``data.subway`` are scanned. An entry qualifies when it has a non-empty
    string ``name`` and a ``cityId`` (or ``code``) that is an integer or a
    decimal string in the range 1..999999. Malformed entries are skipped.

    Args:
        work: Parsed ``work.json`` data.

    Returns:
        (int, str)|None: City id and name, or None when nothing qualifies.
    """
    data = work.get("data") if isinstance(work, dict) else None
    if not isinstance(data, dict):
        return None
    candidates: List[Tuple[int, str]] = []
    for key in ("cities", "city", "workCity", "subway"):
        entries = data.get(key)
        if not isinstance(entries, list):
            continue
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            code = entry.get("cityId") or entry.get("code")
            name = entry.get("name")
            if not (isinstance(name, str) and name):
                continue
            # bool is a subclass of int; True must not be read as city id 1.
            if isinstance(code, bool):
                continue
            # isdecimal() only accepts strings that int() can convert.
            if isinstance(code, str) and code.isdecimal():
                code = int(code)
            if isinstance(code, int) and 1 <= code <= 999999:
                candidates.append((code, name))
    return random.choice(candidates) if candidates else None
|
||||
|
||||
|
||||
def pick_random_job_level3(work: Dict[str, Any]) -> Optional[Tuple[str, str]]:
    """Pick a random level-3 job code (S_SOU_JD_JOB_LEVEL3) from ``work.json`` data.

    Every nested dict whose ``code`` is a string of at least 8 characters
    containing a digit (and whose ``name`` is a string) is a candidate.
    All-digit codes are preferred; otherwise the part before the first ``;``
    of codes containing one; otherwise any candidate.

    Args:
        work: Parsed ``work.json`` data.

    Returns:
        (str, str)|None: Job code and name, or None when no candidate exists.
    """
    def iter_codes(node: Any):
        # Depth-first: a dict's own code comes before those of its values.
        if isinstance(node, dict):
            code, label = node.get("code"), node.get("name")
            if isinstance(code, str) and isinstance(label, str):
                trimmed = code.strip()
                if len(trimmed) >= 8 and trimmed != "不限" and any(map(str.isdigit, trimmed)):
                    yield trimmed, label
            for child in node.values():
                yield from iter_codes(child)
        elif isinstance(node, list):
            for child in node:
                yield from iter_codes(child)

    root = work.get("data") if isinstance(work, dict) else None
    found = list(iter_codes(root)) if root else []
    numeric = [pair for pair in found if pair[0].isdigit()]
    if numeric:
        return random.choice(numeric)
    if not found:
        return None
    heads = [(code.split(";")[0], label) for code, label in found if ";" in code]
    return random.choice(heads or found)
|
||||
|
||||
|
||||
def fetch_service_params() -> Optional[Tuple[int, Optional[str]]]:
    """Fetch one available city/job pair from the service and mark it as used.

    Returns:
        (city_id, job_level3_code|None), or None when the service returns
        nothing usable or any step fails.
    """
    try:
        resp = requests.get(
            f"{API_BASE_URL}/api/v1/keyword/available",
            params={"source": "zhilian", "limit": 1},
            timeout=10,
        )
        if resp.status_code != 200:
            return None
        entries = (resp.json().get("data") or {}).get("items") or []
        if not entries:
            return None
        entry = entries[0]
        entry_id = entry.get("id")
        if entry_id:
            try:
                requests.post(
                    f"{API_BASE_URL}/api/v1/keyword/mark-used",
                    json={"source": "zhilian", "ids": [entry_id]},
                    timeout=10,
                )
            except Exception:
                # A failed mark-used call does not stop the pair being used.
                pass
        try:
            city_id = int(str(entry.get("city")))
        except Exception:
            return None
        job_raw = entry.get("job")
        return (city_id, str(job_raw) if job_raw else None)
    except Exception:
        return None
|
||||
|
||||
|
||||
|
||||
def crawl_pc(city_id: int, page_size: int, max_pages: int, proxies: Optional[str], dedup: bool, job_level3_code: Optional[str] = None) -> None:
    """Crawl job postings for a city through the PC search endpoint.

    Pages are requested from 1 to ``max_pages``; crawling stops early on a
    failed request, a response whose ``code`` is not 200, or an empty page.
    Each job gets a ``companyDesc`` field and every non-empty page is reported
    through ``report_data``.

    Args:
        city_id: City id.
        page_size: Items per page.
        max_pages: Maximum number of pages.
        proxies: Proxy URL, or None.
        dedup: Whether to drop jobs whose ``jobId`` was already seen.
        job_level3_code: Optional level-3 job category code.
    """
    headers = build_headers_pc()
    base_url = "https://fe-api.zhaopin.com/c/i/search/positions"
    seen = set()
    total = 0
    for page in range(1, max_pages + 1):
        log("开始抓取PC职位页", {"city_id": city_id, "page": page, "page_size": page_size, "job_level3": job_level3_code or ""})
        params = {
            "_v": gen_v(),
            "x-zp-page-request-id": gen_page_request_id(),
            "x-zp-client-id": gen_client_id(),
        }
        payload = {
            "S_SOU_WORK_CITY": city_id,
            "order": 4,
            "pageSize": page_size,
            "pageIndex": page,
            "eventScenario": "pcSearchedSouSearch",
            "anonymous": 1,
            "platform": 13,
            "version": "0.0.0",
        }
        if job_level3_code:
            payload["S_SOU_JD_JOB_LEVEL3"] = job_level3_code
        data = request_json("POST", base_url, headers, params=params, json_body=payload, proxies=proxies)
        resp_code = data.get("code") if isinstance(data, dict) else None
        if resp_code != 200:
            log("抓取失败或返回非200", {"page": page, "resp_code": resp_code})
            break
        # "data" may be null or a non-dict; treat that as an empty page.
        body = data.get("data")
        lst = body.get("list") if isinstance(body, dict) else None
        if not lst:
            log("该页无职位数据", {"page": page})
            break
        page_items = []
        for job in lst:
            if not isinstance(job, dict):
                continue
            jid = job.get("jobId")
            if dedup and jid in seen:
                continue
            if dedup and jid:
                seen.add(jid)

            # Attach the company description. The detail endpoints are queried
            # by job "number" (as ZhilianAPI.crawl_pc does); jobId remains only
            # as a fallback.
            num = job.get("number") or job.get("jobId")
            if num:
                job["companyDesc"] = fetch_company_desc_by_job(str(num), proxies) or ""
            page_items.append(job)
        total += len(page_items)
        log("该页职位数", {"page": page, "count": len(page_items)})
        if page_items:
            report_data(page_items, "job", "zhilian")
    log("PC抓取完成", {"total": total})
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def report_data(data_list: List[Dict[str, Any]], data_type: str, platform: str = "zhilian") -> bool:
    """Report a batch of records to the remote API.

    Args:
        data_list: Records to report.
        data_type: Data type label.
        platform: Platform identifier.

    Returns:
        bool: True when the API answered with a 2xx status, else False.
    """
    try:
        universal_data = {
            "data_list": data_list,
            "data_type": data_type,
            "platform": platform
        }
        headers = {
            "accept": "application/json",
            "Content-Type": "application/json",
            'X-Forwarded-For': get_local_ip()
        }
        if API_PUBLIC_HOST:
            headers["Host"] = API_PUBLIC_HOST
            headers["X-Forwarded-Host"] = API_PUBLIC_HOST
        api_endpoint = f"{API_BASE_URL}/api/v1/universal/data/batch-store-async"
        resp = requests.post(api_endpoint, json=universal_data, headers=headers, timeout=300)
        ok = 200 <= resp.status_code < 300
        log("数据上报完成", {"count": len(data_list), "status_code": resp.status_code, "ok": ok})
        return ok
    except Exception as exc:
        # Reporting stays best effort, but the reason (network error, missing
        # helper, serialisation problem) is logged instead of being hidden.
        logger.warning("数据上报失败 count={} error={!r}", len(data_list), exc)
        return False
|
||||
|
||||
|
||||
def main() -> None:
    """Script entry point: crawl forever, one city/job combination per round.

    Each round takes its parameters from ``fetch_service_params``. When the
    service returns nothing, a random city and job code from ``work.json`` are
    used, falling back to ``CITY_ID`` with no job code. A failing round is
    logged and the loop continues after a random pause.

    Returns:
        None
    """
    work = load_work_data()
    api = ZhilianAPI(PROXY)
    while True:
        city_name = None
        job_name = None
        svc = fetch_service_params()
        if svc:
            city_id, job_code = svc
        else:
            city_id = CITY_ID
            job_code = None
            if work:
                rnd_city = pick_random_city(work)
                if isinstance(rnd_city, tuple):
                    city_id, city_name = rnd_city
                rnd_job = pick_random_job_level3(work)
                if isinstance(rnd_job, tuple):
                    job_code, job_name = rnd_job
        log("开始一轮抓取", {"city_id": city_id, "city_name": city_name or "", "job_code": job_code or "", "job_name": job_name or ""})
        try:
            api.crawl_pc(city_id, PAGE_SIZE, MAX_PAGES, DEDUP, job_code)
        except Exception:
            # Keep the loop alive, but record the traceback instead of
            # discarding it.
            logger.exception("本轮抓取异常 city_id={} job_code={}", city_id, job_code or "")
        sleep_random_between()
|
||||
|
||||
|
||||
def get_local_ip() -> str:
    """Return the local IP address.

    Returns:
        str: The local IP address, or ``"127.0.0.1"`` when it cannot be
        determined.
    """
    try:
        # The context manager closes the socket even when connect() fails.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
            sock.connect(("8.8.8.8", 80))
            return sock.getsockname()[0]
    except Exception:
        return "127.0.0.1"


# The entry guard must come after every definition: main() loops forever, so
# anything defined below it would never exist when the file runs as a script.
if __name__ == "__main__":
    main()
|
||||
Loading…
x
Reference in New Issue
Block a user