fix(04): correct architecture — private files use crawler_core directly
Architecture clarification from the user: spiderJobs/ is a standalone execution package
and is NOT meant to be imported by app/. The correct dependency graph is:
          crawler_core              ← shared base library
           ↑        ↑
  spiderJobs        app/services/crawler/
  (standalone)      (FastAPI backend, private layer)
Changes:
- boss.py/qcwy.py/zhilian.py: revert imports back to the private modules (_boss_api, etc.)
- _boss_api.py/_job51_api.py/_zhilian_api.py: use crawler_core.base.Result/BaseFetcher/BaseSearcher
  + fix self._http → self.http_client
- _boss_client.py/_job51_client.py/_zhilian_client.py: use crawler_core.http_client.HTTPClient
  + _boss_client uses crawler_core.boss.sign.BossSign directly
- _boss_sign.py/_job51_sign.py/_zhilian_sign.py: backward-compat stubs re-exporting from crawler_core.*.sign
Full regression: 106 passed in 0.68s
This commit is contained in:
parent
3aadbd128b
commit
2b94f15b56
@ -13,7 +13,7 @@ from __future__ import annotations
|
|||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
from urllib.parse import urlencode
|
from urllib.parse import urlencode
|
||||||
|
|
||||||
from app.services.crawler._base import ApiResult, BaseFetcher, BaseSearcher
|
from crawler_core.base import BaseFetcher, BaseSearcher, Result as ApiResult
|
||||||
from app.services.crawler._boss_client import BossClient, create_client
|
from app.services.crawler._boss_client import BossClient, create_client
|
||||||
|
|
||||||
|
|
||||||
@ -78,7 +78,7 @@ class SearchRecJobs(BaseSearcher):
|
|||||||
}
|
}
|
||||||
|
|
||||||
def _request(self, params: dict) -> tuple[int, Any]:
|
def _request(self, params: dict) -> tuple[int, Any]:
|
||||||
return self._http.get(self.ENDPOINT, params)
|
return self.http_client.get(self.ENDPOINT, params)
|
||||||
|
|
||||||
def _parse(self, http_code: int, raw: Any) -> ApiResult:
|
def _parse(self, http_code: int, raw: Any) -> ApiResult:
|
||||||
return _parse_boss_response(http_code, raw)
|
return _parse_boss_response(http_code, raw)
|
||||||
@ -113,7 +113,7 @@ class GetJobDetail(BaseFetcher):
|
|||||||
{"path": "/wapi/zpgeek/miniapp/jobdetail/improvement/query.json", "method": "GET", "query": improvement_query},
|
{"path": "/wapi/zpgeek/miniapp/jobdetail/improvement/query.json", "method": "GET", "query": improvement_query},
|
||||||
]
|
]
|
||||||
try:
|
try:
|
||||||
client: BossClient = self._http
|
client: BossClient = self.http_client
|
||||||
http_code, data = client.batch(sub_reqs)
|
http_code, data = client.batch(sub_reqs)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return ApiResult(success=False, status_code=-1, error=str(e))
|
return ApiResult(success=False, status_code=-1, error=str(e))
|
||||||
@ -176,7 +176,7 @@ class SearchBrandJobs(BaseSearcher):
|
|||||||
}
|
}
|
||||||
|
|
||||||
def _request(self, params: dict) -> tuple[int, Any]:
|
def _request(self, params: dict) -> tuple[int, Any]:
|
||||||
return self._http.get(self.ENDPOINT, params)
|
return self.http_client.get(self.ENDPOINT, params)
|
||||||
|
|
||||||
def _parse(self, http_code: int, raw: Any) -> ApiResult:
|
def _parse(self, http_code: int, raw: Any) -> ApiResult:
|
||||||
return _parse_boss_response(http_code, raw)
|
return _parse_boss_response(http_code, raw)
|
||||||
|
|||||||
@ -12,8 +12,8 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
from app.services.crawler._http_client import HTTPClient
|
from crawler_core.http_client import HTTPClient
|
||||||
from app.services.crawler._boss_sign import BossSign
|
from crawler_core.boss.sign import BossSign
|
||||||
|
|
||||||
BASE_URL = "https://www.zhipin.com"
|
BASE_URL = "https://www.zhipin.com"
|
||||||
|
|
||||||
|
|||||||
@ -4,75 +4,12 @@
|
|||||||
# 将在下一里程碑中删除。
|
# 将在下一里程碑中删除。
|
||||||
#
|
#
|
||||||
"""
|
"""
|
||||||
Boss直聘 Traceid 生成算法
|
Boss直聘 Traceid 生成算法 — 向后兼容桩
|
||||||
复制自 spiderJobs/platforms/boss/sign.py — import 改为本地引用
|
|
||||||
|
已迁移至 crawler_core.boss.sign。
|
||||||
|
直接从 crawler_core 重新导出,避免下游代码出现 ImportError。
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from crawler_core.boss.sign import BossSign # noqa: F401
|
||||||
|
|
||||||
import random
|
__all__ = ["BossSign"]
|
||||||
import time
|
|
||||||
|
|
||||||
|
|
||||||
_CHARS = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
|
||||||
|
|
||||||
|
|
||||||
def _to_u32(n: int) -> int:
|
|
||||||
return n & 0xFFFFFFFF
|
|
||||||
|
|
||||||
|
|
||||||
def _compute_checksum(uuid_str: str) -> str:
|
|
||||||
r = 0
|
|
||||||
for ch in uuid_str:
|
|
||||||
r = ((r << 5) - r + ord(ch)) & 0xFFFFFFFF
|
|
||||||
|
|
||||||
a = 0
|
|
||||||
for i in range(len(uuid_str) - 1, -1, -1):
|
|
||||||
a = ((a << 7) - a + ord(uuid_str[i]) * (i + 1)) & 0xFFFFFFFF
|
|
||||||
|
|
||||||
n = 0
|
|
||||||
mid = len(uuid_str) // 2
|
|
||||||
for i in range(len(uuid_str)):
|
|
||||||
n = ((n << 3) - n + ord(uuid_str[i]) * (abs(i - mid) + 1)) & 0xFFFFFFFF
|
|
||||||
|
|
||||||
s = _to_u32(r ^ a)
|
|
||||||
s = _to_u32(2654435761 * s)
|
|
||||||
s = _to_u32(s ^ (s >> 16))
|
|
||||||
s = _to_u32(2246822507 * s)
|
|
||||||
s = _to_u32(s ^ (s >> 13))
|
|
||||||
c1 = _CHARS[s % 62]
|
|
||||||
|
|
||||||
h = _to_u32(a ^ n)
|
|
||||||
h = _to_u32(3266489909 * h)
|
|
||||||
h = _to_u32(h ^ (h >> 16))
|
|
||||||
h = _to_u32(2654435761 * h)
|
|
||||||
h = _to_u32(h ^ (h >> 13))
|
|
||||||
c2 = _CHARS[h % 62]
|
|
||||||
|
|
||||||
v = _to_u32(n ^ r)
|
|
||||||
v = _to_u32(668265261 * v)
|
|
||||||
v = _to_u32(v ^ (v >> 16))
|
|
||||||
v = _to_u32(2246822507 * v)
|
|
||||||
v = _to_u32(v ^ (v >> 13))
|
|
||||||
c3 = _CHARS[v % 62]
|
|
||||||
|
|
||||||
return f"{c1}{c2}{c3}"
|
|
||||||
|
|
||||||
|
|
||||||
def _generate_uuid() -> str:
|
|
||||||
hex_ts = format(int(time.time() * 1000), "x").lower()
|
|
||||||
hex_ts = hex_ts[-13:].zfill(13)
|
|
||||||
rand_part = "".join(random.choice(_CHARS) for _ in range(6))
|
|
||||||
return hex_ts + rand_part
|
|
||||||
|
|
||||||
|
|
||||||
class BossSign:
|
|
||||||
def __init__(self, *, mpt: str = "", wt2: str = ""):
|
|
||||||
self.mpt = mpt
|
|
||||||
self.wt2 = wt2
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def generate_traceid(prefix: str = "M-W") -> str:
|
|
||||||
uuid_str = _generate_uuid()
|
|
||||||
checksum = _compute_checksum(uuid_str)
|
|
||||||
return f"{prefix}{uuid_str}{checksum}"
|
|
||||||
|
|||||||
@ -12,7 +12,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
from app.services.crawler._base import ApiResult, BaseFetcher, BaseSearcher
|
from crawler_core.base import BaseFetcher, BaseSearcher, Result as ApiResult
|
||||||
from app.services.crawler._job51_client import Job51Client, create_client
|
from app.services.crawler._job51_client import Job51Client, create_client
|
||||||
|
|
||||||
|
|
||||||
@ -105,7 +105,7 @@ class GetJobDetail(BaseFetcher):
|
|||||||
def fetch(self) -> ApiResult:
|
def fetch(self) -> ApiResult:
|
||||||
endpoint = f"{self.ENDPOINT}/{self.job_id}"
|
endpoint = f"{self.ENDPOINT}/{self.job_id}"
|
||||||
try:
|
try:
|
||||||
http_code, data = self._http.get(endpoint)
|
http_code, data = self.http_client.get(endpoint)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return ApiResult(success=False, status_code=-1, error=str(e))
|
return ApiResult(success=False, status_code=-1, error=str(e))
|
||||||
return self._parse(http_code, data)
|
return self._parse(http_code, data)
|
||||||
@ -135,7 +135,7 @@ class GetCompanyInfo(BaseFetcher):
|
|||||||
|
|
||||||
def fetch(self) -> ApiResult:
|
def fetch(self) -> ApiResult:
|
||||||
try:
|
try:
|
||||||
http_code, data = self._http.get(self.ENDPOINT, self._build_params())
|
http_code, data = self.http_client.get(self.ENDPOINT, self._build_params())
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return ApiResult(success=False, status_code=-1, error=str(e))
|
return ApiResult(success=False, status_code=-1, error=str(e))
|
||||||
return self._parse(http_code, data)
|
return self._parse(http_code, data)
|
||||||
|
|||||||
@ -14,8 +14,8 @@ import json
|
|||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
|
|
||||||
from app.services.crawler._http_client import HTTPClient
|
from crawler_core.http_client import HTTPClient
|
||||||
from app.services.crawler._job51_sign import Job51Sign
|
from crawler_core.qcwy.sign import Job51Sign
|
||||||
|
|
||||||
BASE_URL = "https://cupid.51job.com"
|
BASE_URL = "https://cupid.51job.com"
|
||||||
|
|
||||||
|
|||||||
@ -4,59 +4,12 @@
|
|||||||
# 将在下一里程碑中删除。
|
# 将在下一里程碑中删除。
|
||||||
#
|
#
|
||||||
"""
|
"""
|
||||||
前程无忧 (51Job) 签名算法
|
前程无忧 (51Job) 签名 — 向后兼容桩
|
||||||
复制自 spiderJobs/platforms/job51/sign.py — import 改为本地引用
|
|
||||||
|
已迁移至 crawler_core.qcwy.sign。
|
||||||
|
直接从 crawler_core 重新导出,避免下游代码出现 ImportError。
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from crawler_core.qcwy.sign import Job51Sign # noqa: F401
|
||||||
|
|
||||||
import hmac
|
__all__ = ["Job51Sign"]
|
||||||
import hashlib
|
|
||||||
import time
|
|
||||||
import random
|
|
||||||
from urllib.parse import quote
|
|
||||||
|
|
||||||
|
|
||||||
SIGN_KEY = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b"
|
|
||||||
|
|
||||||
|
|
||||||
class Job51Sign:
|
|
||||||
def __init__(self, *, sign_key: str = SIGN_KEY):
|
|
||||||
self.sign_key = sign_key
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def generate_uuid() -> str:
|
|
||||||
ts = str(int(time.time() * 1000))
|
|
||||||
rand = str(random.randint(1000000000, 9999999999))
|
|
||||||
return ts + rand
|
|
||||||
|
|
||||||
def build_sign_path(
|
|
||||||
self,
|
|
||||||
endpoint: str,
|
|
||||||
method: str = "GET",
|
|
||||||
params: dict | None = None,
|
|
||||||
body: dict | None = None,
|
|
||||||
) -> tuple[str, str]:
|
|
||||||
import json
|
|
||||||
|
|
||||||
ts = int(time.time())
|
|
||||||
path = f"/{endpoint}?api_key=51job&timestamp={ts}"
|
|
||||||
|
|
||||||
if method.upper() == "GET" and params:
|
|
||||||
query_parts = []
|
|
||||||
for k, v in params.items():
|
|
||||||
query_parts.append(f"{quote(str(k), safe='')}={quote(str(v), safe='')}")
|
|
||||||
if query_parts:
|
|
||||||
path += "&" + "&".join(query_parts)
|
|
||||||
|
|
||||||
message = path
|
|
||||||
if method.upper() == "POST" and body is not None:
|
|
||||||
message += json.dumps(body, ensure_ascii=False, separators=(",", ":"))
|
|
||||||
|
|
||||||
sign_hex = hmac.new(
|
|
||||||
self.sign_key.encode("utf-8"),
|
|
||||||
message.encode("utf-8"),
|
|
||||||
hashlib.sha256,
|
|
||||||
).hexdigest()
|
|
||||||
|
|
||||||
return path, sign_hex
|
|
||||||
|
|||||||
@ -12,7 +12,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
from app.services.crawler._base import BaseFetcher, BaseSearcher
|
from crawler_core.base import BaseFetcher, BaseSearcher
|
||||||
from app.services.crawler._zhilian_client import ZhilianClient, create_cgate_client, create_capi_client
|
from app.services.crawler._zhilian_client import ZhilianClient, create_cgate_client, create_capi_client
|
||||||
|
|
||||||
|
|
||||||
@ -145,4 +145,4 @@ class SearchCompanyPositions(BaseSearcher):
|
|||||||
return params
|
return params
|
||||||
|
|
||||||
def _request(self, params: dict) -> tuple[int, Any]:
|
def _request(self, params: dict) -> tuple[int, Any]:
|
||||||
return self._http.get(self.ENDPOINT, params)
|
return self.http_client.get(self.ENDPOINT, params)
|
||||||
|
|||||||
@ -12,8 +12,8 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from typing import Any, Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
from app.services.crawler._http_client import HTTPClient
|
from crawler_core.http_client import HTTPClient
|
||||||
from app.services.crawler._zhilian_sign import ZhilianSign
|
from crawler_core.zhilian.sign import ZhilianSign
|
||||||
|
|
||||||
CGATE_BASE_URL = "https://cgate.zhaopin.com"
|
CGATE_BASE_URL = "https://cgate.zhaopin.com"
|
||||||
CAPI_BASE_URL = "https://capi.zhaopin.com"
|
CAPI_BASE_URL = "https://capi.zhaopin.com"
|
||||||
|
|||||||
@ -4,60 +4,12 @@
|
|||||||
# 将在下一里程碑中删除。
|
# 将在下一里程碑中删除。
|
||||||
#
|
#
|
||||||
"""
|
"""
|
||||||
智联招聘签名算法
|
智联招聘签名 — 向后兼容桩
|
||||||
复制自 spiderJobs/platforms/zhilian/sign.py — import 改为本地引用
|
|
||||||
|
已迁移至 crawler_core.zhilian.sign。
|
||||||
|
直接从 crawler_core 重新导出,避免下游代码出现 ImportError。
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from crawler_core.zhilian.sign import ZhilianSign # noqa: F401
|
||||||
|
|
||||||
import math
|
__all__ = ["ZhilianSign"]
|
||||||
import random
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
|
|
||||||
class ZhilianSign:
|
|
||||||
def __init__(
|
|
||||||
self, *, at: str = "", rt: str = "",
|
|
||||||
device_id: Optional[str] = None, version: str = "4.1.259",
|
|
||||||
channel: str = "wxxiaochengxu", platform: str = "12",
|
|
||||||
):
|
|
||||||
self.at = at
|
|
||||||
self.rt = rt
|
|
||||||
self.device_id = device_id or self.generate_uuid()
|
|
||||||
self.version = version
|
|
||||||
self.channel = channel
|
|
||||||
self.platform = platform
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def generate_uuid() -> str:
|
|
||||||
chars = "0123456789ABCDEF"
|
|
||||||
uuid = [""] * 36
|
|
||||||
for i in range(36):
|
|
||||||
uuid[i] = chars[math.floor(16 * random.random())]
|
|
||||||
uuid[14] = "4"
|
|
||||||
uuid[19] = chars[(int(uuid[19], 16) & 0x3) | 0x8]
|
|
||||||
uuid[8] = uuid[13] = uuid[18] = uuid[23] = "-"
|
|
||||||
return "".join(uuid)
|
|
||||||
|
|
||||||
def sign_headers(self, page_code: str = "0") -> dict:
|
|
||||||
return {
|
|
||||||
"x-zp-at": self.at,
|
|
||||||
"x-zp-rt": self.rt,
|
|
||||||
"x-zp-action-id": self.generate_uuid(),
|
|
||||||
"x-zp-page-code": page_code,
|
|
||||||
"x-zp-version": self.version,
|
|
||||||
"x-zp-channel": self.channel,
|
|
||||||
"x-zp-platform": self.platform,
|
|
||||||
"x-zp-device-id": self.device_id,
|
|
||||||
"x-zp-business-system": "73",
|
|
||||||
}
|
|
||||||
|
|
||||||
def sign_params(self) -> dict:
|
|
||||||
return {
|
|
||||||
"at": self.at,
|
|
||||||
"rt": self.rt,
|
|
||||||
"channel": self.channel,
|
|
||||||
"platform": self.platform,
|
|
||||||
"version": self.version,
|
|
||||||
"d": self.device_id,
|
|
||||||
}
|
|
||||||
|
|||||||
@ -9,14 +9,14 @@ from typing import Any, Dict, List, Optional
|
|||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from spiderJobs.platforms.boss.api import (
|
from app.services.crawler._boss_api import (
|
||||||
GetBrandDetail,
|
GetBrandDetail,
|
||||||
GetJobDetail,
|
GetJobDetail,
|
||||||
SearchBrandJobs,
|
SearchBrandJobs,
|
||||||
SearchRecJobs,
|
SearchRecJobs,
|
||||||
)
|
)
|
||||||
from spiderJobs.platforms.boss.client import BossClient, create_client
|
from app.services.crawler._boss_client import BossClient, create_client
|
||||||
from spiderJobs.platforms.boss.sign import BossSign
|
from app.services.crawler._boss_sign import BossSign
|
||||||
|
|
||||||
|
|
||||||
class BossService:
|
class BossService:
|
||||||
|
|||||||
@ -9,13 +9,13 @@ from typing import Any, Dict, List, Optional
|
|||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from spiderJobs.platforms.job51.api import (
|
from app.services.crawler._job51_api import (
|
||||||
GetCompanyInfo,
|
GetCompanyInfo,
|
||||||
GetJobDetail,
|
GetJobDetail,
|
||||||
SearchCompanyJobs,
|
SearchCompanyJobs,
|
||||||
SearchRecommendJobs,
|
SearchRecommendJobs,
|
||||||
)
|
)
|
||||||
from spiderJobs.platforms.job51.client import Job51Client, create_client
|
from app.services.crawler._job51_client import Job51Client, create_client
|
||||||
|
|
||||||
|
|
||||||
class QcwyService:
|
class QcwyService:
|
||||||
|
|||||||
@ -9,18 +9,18 @@ from typing import Any, Dict, List, Optional
|
|||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
|
|
||||||
from spiderJobs.platforms.zhilian.api import (
|
from app.services.crawler._zhilian_api import (
|
||||||
GetCompanyDetail,
|
GetCompanyDetail,
|
||||||
GetPositionDetail,
|
GetPositionDetail,
|
||||||
SearchCompanyPositions,
|
SearchCompanyPositions,
|
||||||
SearchPositions,
|
SearchPositions,
|
||||||
)
|
)
|
||||||
from spiderJobs.platforms.zhilian.client import (
|
from app.services.crawler._zhilian_client import (
|
||||||
ZhilianClient,
|
ZhilianClient,
|
||||||
create_capi_client,
|
create_capi_client,
|
||||||
create_cgate_client,
|
create_cgate_client,
|
||||||
)
|
)
|
||||||
from spiderJobs.platforms.zhilian.sign import ZhilianSign
|
from app.services.crawler._zhilian_sign import ZhilianSign
|
||||||
|
|
||||||
|
|
||||||
class ZhilianService:
|
class ZhilianService:
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user