From b27686a409917af393f3a63a310165cdb3633010 Mon Sep 17 00:00:00 2001 From: win Date: Sat, 21 Mar 2026 17:45:14 +0800 Subject: [PATCH] docs(01-shared-core): create phase 1 plans for crawler_core shared package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Plan 01-01 (Wave 1): Package scaffold with HTTPClient + tenacity retry (min=10s) + stdlib logging + BaseFetcher/BaseSearcher base classes + pyproject.toml. Covers ARCH-01, ARCH-02, QUAL-04, QUAL-05. Plan 01-02 (Wave 2): Sign algorithm migration (Boss/Job51/Zhilian) to crawler_core/ + comprehensive unit tests — no HTTP, no mocks, pure functions. Covers QUAL-01. 24+ test cases across 3 test files. ROADMAP updated: Phase 1 now shows 2 concrete plans instead of TBD. --- .planning/ROADMAP.md | 9 +- .planning/phases/01-shared-core/01-01-PLAN.md | 516 ++++++++++++++ .planning/phases/01-shared-core/01-02-PLAN.md | 663 ++++++++++++++++++ 3 files changed, 1185 insertions(+), 3 deletions(-) create mode 100644 .planning/phases/01-shared-core/01-01-PLAN.md create mode 100644 .planning/phases/01-shared-core/01-02-PLAN.md diff --git a/.planning/ROADMAP.md b/.planning/ROADMAP.md index 7d88466..3574bf5 100644 --- a/.planning/ROADMAP.md +++ b/.planning/ROADMAP.md @@ -30,7 +30,10 @@ 3. BaseFetcher/BaseSearcher 基类定义完整,提供模板方法供子类实现 4. loguru 日志格式统一,tenacity 重试装饰器可开箱即用 5. 旧爬虫(jobs_spider/)仍正常运行,未被改动(feature flag 隔离) -**Plans:** TBD +**Plans:** 2 plans +Plans: +- [ ] 01-01-PLAN.md — Package scaffold: pyproject.toml + HTTPClient (tenacity + logging) + BaseFetcher/BaseSearcher + Pipfile deps +- [ ] 01-02-PLAN.md — Sign algorithms: port BossSign/Job51Sign/ZhilianSign to crawler_core/ + unit tests (pytest) ### Phase 2: Boss 直聘重写 **Goal:** Boss 直聘爬虫完全基于 crawler_core 运行,旧实现可安全停用 @@ -93,7 +96,7 @@ | Phase | Plans Complete | Status | Completed | |-------|----------------|--------|-----------| -| 1. 共享核心包 | 0/? | Not started | - | +| 1. 共享核心包 | 0/2 | In progress | - | | 2. Boss 直聘重写 | 0/? 
| Not started | - | | 3. 前程无忧 & 智联重写 | 0/? | Not started | - | | 4. 后端 & 外部脚本接入 | 0/? | Not started | - | @@ -131,4 +134,4 @@ --- *Roadmap created: 2026-03-21* -*Last updated: 2026-03-21 after initial creation* +*Last updated: 2026-03-21 — Phase 1 plans created (01-01, 01-02)* diff --git a/.planning/phases/01-shared-core/01-01-PLAN.md b/.planning/phases/01-shared-core/01-01-PLAN.md new file mode 100644 index 0000000..c1e27df --- /dev/null +++ b/.planning/phases/01-shared-core/01-01-PLAN.md @@ -0,0 +1,516 @@ +--- +phase: 01-shared-core +plan: 01 +type: execute +wave: 1 +depends_on: [] +files_modified: + - crawler_core/__init__.py + - crawler_core/http_client.py + - crawler_core/base.py + - crawler_core/boss/__init__.py + - crawler_core/qcwy/__init__.py + - crawler_core/zhilian/__init__.py + - crawler_core/pyproject.toml + - Pipfile +autonomous: true +requirements: + - ARCH-01 + - ARCH-02 + - QUAL-04 + - QUAL-05 + +must_haves: + truths: + - "`pip install -e ./crawler_core` succeeds without errors" + - "`from crawler_core import BaseFetcher, BaseSearcher, ApiResult, HTTPClient` imports cleanly" + - "HTTPClient retries failed requests up to 3 times with exponential backoff (minimum 10s wait)" + - "All HTTP errors are logged to stderr via stdlib logging with level, url, and error message" + - "Old spiderJobs/ and jobs_spider/ code is NOT modified — feature flag isolation holds" + artifacts: + - path: "crawler_core/pyproject.toml" + provides: "Package metadata for editable install" + contains: "name = \"crawler_core\"" + - path: "crawler_core/__init__.py" + provides: "Public API surface" + exports: ["BaseFetcher", "BaseSearcher", "ApiResult", "HTTPClient"] + - path: "crawler_core/http_client.py" + provides: "TLS-fingerprinted HTTP client with retry and logging" + exports: ["HTTPClient"] + - path: "crawler_core/base.py" + provides: "Template-method base classes" + exports: ["ApiResult", "BaseFetcher", "BaseSearcher", "parse_response"] + key_links: + - from: 
"crawler_core/__init__.py" + to: "crawler_core/http_client.py" + via: "from crawler_core.http_client import HTTPClient" + - from: "crawler_core/__init__.py" + to: "crawler_core/base.py" + via: "from crawler_core.base import BaseFetcher, BaseSearcher, ApiResult" + - from: "crawler_core/base.py" + to: "crawler_core/http_client.py" + via: "from crawler_core.http_client import HTTPClient" +--- + + +Create the crawler_core/ installable shared package with its core infrastructure: HTTP client with TLS fingerprint, retry logic, stdlib logging, and the BaseFetcher/BaseSearcher template-method base classes. + +Purpose: This is the foundation everything else depends on. Once installed with `pip install -e ./crawler_core`, Phase 2/3 platform rewrites can import from it instead of copying code. + +Output: A working Python package at crawler_core/ that installs cleanly and exposes BaseFetcher, BaseSearcher, ApiResult, and HTTPClient. + + + +@~/.claude/get-shit-done/workflows/execute-plan.md +@~/.claude/get-shit-done/templates/summary.md + + + +@.planning/PROJECT.md +@.planning/ROADMAP.md +@.planning/phases/01-shared-core/1-CONTEXT.md + + + + + +From spiderJobs/core/http_client.py: +```python +class HTTPClient: + def __init__(self, base_url, default_headers=None, proxy=None, + tunnel_proxy=None, proxy_pool=None, timeout=10): ... + def _new_session(self) -> requests.Session: ... + def _get_proxies(self) -> Optional[dict]: ... + def _merge_headers(self, extra=None) -> dict: ... + def post(self, path, body, headers=None) -> tuple[int, Any]: ... + def get(self, path, params=None, headers=None) -> tuple[int, Any]: ... +``` + +From spiderJobs/core/base.py: +```python +@dataclass +class ApiResult: + success: bool + status_code: int + data: Any = None + list: list[dict] = field(default_factory=list) + count: int = 0 + is_end_page: bool = True + error: Optional[str] = None + +def parse_response(http_code: int, raw: Any) -> ApiResult: ... 
+ +class BaseFetcher: + ENDPOINT: str = "" + def __init__(self, http_client: HTTPClient): ... + def _build_params(self) -> dict: raise NotImplementedError + def _parse(self, http_code, raw) -> ApiResult: ... + def fetch(self) -> ApiResult: ... + +class BaseSearcher: + ENDPOINT: str = "" + def __init__(self, page_size=15, http_client=None): ... + def _build_params(self, page_index) -> dict: raise NotImplementedError + def _request(self, params) -> tuple[int, Any]: ... + def _parse(self, http_code, raw) -> ApiResult: ... + def search(self, page_index=1) -> ApiResult: ... + def load_all(self, max_pages=10, on_page=None) -> list[dict]: ... +``` + + + + + + + Task 1: Create crawler_core package scaffold and pyproject.toml + + - /Users/win/2025/AICoding/JobData/pyproject.toml (understand existing project config format) + - /Users/win/2025/AICoding/JobData/Pipfile (understand dependency structure to add entries) + - /Users/win/2025/AICoding/JobData/.planning/phases/01-shared-core/1-CONTEXT.md (decisions D-01 through D-04) + + + crawler_core/pyproject.toml + crawler_core/boss/__init__.py + crawler_core/qcwy/__init__.py + crawler_core/zhilian/__init__.py + Pipfile + + +Create the crawler_core/ directory structure and configure it as an installable Python package. + +**Step 1: Create crawler_core/pyproject.toml** + +```toml +[build-system] +requires = ["setuptools>=68"] +build-backend = "setuptools.build_meta" + +[project] +name = "crawler_core" +version = "0.1.0" +description = "Shared crawler core — sign algorithms, HTTP client, base classes" +requires-python = ">=3.11" +dependencies = [ + "requests_go==1.0.9", + "tenacity>=8.0", +] + +[tool.setuptools.packages.find] +where = [".."] +include = ["crawler_core*"] +``` + +NOTE: `where = [".."]` makes setuptools search for the `crawler_core` package one level above the directory that contains this pyproject.toml, i.e. in the repo root. This makes `pip install -e ./crawler_core` resolve correctly. 
+ +**Step 2: Create platform namespace __init__.py files (docstring only)** + +Create these three files with a single docstring only — NO imports, they are just namespace markers: +- `crawler_core/boss/__init__.py`: `"""Boss直聘 platform module."""` +- `crawler_core/qcwy/__init__.py`: `"""前程无忧 (51Job) platform module."""` +- `crawler_core/zhilian/__init__.py`: `"""智联招聘 platform module."""` + +**Step 3: Add dependencies to Pipfile** + +In the `[packages]` section (before `[dev-packages]`), add these two lines (after `playwright = "==1.57.0"`): + +``` +requests_go = "==1.0.9" +tenacity = ">=8.0" +``` + +In the `[dev-packages]` section, add: + +``` +pytest = ">=8.0" +pytest-cov = ">=4.0" +pytest-anyio = "*" +``` + +**What NOT to do:** +- Do NOT create a crawler_core/__init__.py in this task (Task 3 creates it) +- Do NOT create crawler_core/http_client.py or crawler_core/base.py (Task 2 and 3) +- Do NOT run `pip install` — just write the files + + + python -c "import tomllib; d=tomllib.load(open('/Users/win/2025/AICoding/JobData/crawler_core/pyproject.toml','rb')); assert d['project']['name']=='crawler_core'; print('pyproject.toml OK')" && grep -q "requests_go" /Users/win/2025/AICoding/JobData/Pipfile && grep -q "tenacity" /Users/win/2025/AICoding/JobData/Pipfile && grep -q "pytest" /Users/win/2025/AICoding/JobData/Pipfile && echo "Pipfile OK" + + + - `crawler_core/pyproject.toml` exists and contains `name = "crawler_core"`, `requires-python = ">=3.11"`, `requests_go==1.0.9`, `tenacity>=8.0` + - `crawler_core/boss/__init__.py`, `crawler_core/qcwy/__init__.py`, `crawler_core/zhilian/__init__.py` all exist (can be empty docstrings) + - `Pipfile` [packages] section contains `requests_go = "==1.0.9"` and `tenacity = ">=8.0"` + - `Pipfile` [dev-packages] section contains `pytest`, `pytest-cov`, `pytest-anyio` + - `grep -c "requests_go" /Users/win/2025/AICoding/JobData/Pipfile` outputs `1` (no duplicates) + + Package directory structure created, pyproject.toml valid, dependencies 
declared in Pipfile. + + + + Task 2: Create crawler_core/http_client.py with tenacity retry and logging + + - /Users/win/2025/AICoding/JobData/spiderJobs/core/http_client.py (source to port — read every line) + - /Users/win/2025/AICoding/JobData/.planning/research/STACK.md (tenacity config section, TLS fingerprint section) + - /Users/win/2025/AICoding/JobData/.planning/phases/01-shared-core/1-CONTEXT.md (D-03: no loguru, stdlib only; D-09: one HTTPClient class) + + + crawler_core/http_client.py + + +Port `spiderJobs/core/http_client.py` to `crawler_core/http_client.py` with two additions: tenacity retry and stdlib logging. + +**The file must be exactly `crawler_core/http_client.py`** — no subdirectory. + +**Imports to use (CRITICAL — per D-03, only requests_go + stdlib + tenacity):** +```python +from __future__ import annotations + +import logging +import random +from typing import Any, Optional + +import requests_go as requests +from requests_go.tls_config import TLS_CHROME_LATEST +from tenacity import ( + retry, + retry_if_exception_type, + stop_after_attempt, + wait_random_exponential, +) +``` + +**Logging setup (module-level, before the class):** +```python +logger = logging.getLogger("crawler_core.http_client") +``` + +This uses stdlib logging — NOT loguru (per D-03, loguru is excluded from crawler_core). Callers (app/services/crawler/) can configure loguru to bridge stdlib if desired. + +**Class structure:** Copy the full `HTTPClient` class from `spiderJobs/core/http_client.py` EXACTLY, then make these targeted changes: + +1. **Keep all existing methods unchanged:** `__init__`, `_new_session`, `_get_proxies`, `_merge_headers` + +2. 
**Wrap `post()` with tenacity retry decorator:** +```python +@retry( + stop=stop_after_attempt(3), + wait=wait_random_exponential(multiplier=1, min=10, max=30), + retry=retry_if_exception_type((ConnectionError, TimeoutError, OSError)), + reraise=True, + before_sleep=lambda retry_state: logger.warning( + "HTTP retry attempt=%d url=%s error=%s", + retry_state.attempt_number, + retry_state.args[1] if retry_state.args else "unknown", + retry_state.outcome.exception(), + ), +) +def post(self, path: str, body: dict, headers: Optional[dict] = None) -> tuple[int, Any]: + """发送 POST 请求""" + # ... existing body unchanged ... + logger.debug("POST %s%s", self.base_url, path) + # existing try/finally logic unchanged +``` + +3. **Wrap `get()` with the same tenacity retry decorator** (identical decorator, same pattern): +```python +@retry( + stop=stop_after_attempt(3), + wait=wait_random_exponential(multiplier=1, min=10, max=30), + retry=retry_if_exception_type((ConnectionError, TimeoutError, OSError)), + reraise=True, + before_sleep=lambda retry_state: logger.warning( + "HTTP retry attempt=%d error=%s", + retry_state.attempt_number, + retry_state.outcome.exception(), + ), +) +def get(self, path: str, params: Optional[dict] = None, headers: Optional[dict] = None) -> tuple[int, Any]: + """发送 GET 请求""" + logger.debug("GET %s%s", self.base_url, path) + # ... existing body unchanged ... +``` + +4. **Add module docstring at the top:** +```python +""" +crawler_core.http_client — 通用 HTTP 客户端 + +基于 requests-go,自带 Chrome TLS 指纹伪装(TLS_CHROME_LATEST + random_ja3=True)。 +支持代理 IP / 隧道代理 / 代理池轮换。 +内置 tenacity 重试(3次,指数退避,最小10秒间隔)。 +使用 stdlib logging — 上层可通过 logging.getLogger('crawler_core') 配置。 + +不依赖 loguru / FastAPI / Tortoise-ORM 等应用框架。 +""" +``` + +**Minimum 10 second wait is MANDATORY** — `min=10` in `wait_random_exponential` preserves the anti-detection delay requirement from STACK.md. 
+ +**Do NOT:** +- Change the proxy logic (keep tunnel_proxy / proxy_pool / fixed proxy logic identical) +- Import loguru +- Import anything from `spiderJobs.*` or `app.*` + + + cd /Users/win/2025/AICoding/JobData && python -c " +import sys +sys.path.insert(0, '.') +from crawler_core.http_client import HTTPClient +import inspect, logging +src = inspect.getsource(HTTPClient.post) +assert 'retry' in src or hasattr(HTTPClient.post, 'retry'), 'tenacity decorator missing on post' +assert 'logger' in src or 'logging' in src, 'logging missing in post' +print('HTTPClient OK') +" + + + - `crawler_core/http_client.py` exists and is importable: `from crawler_core.http_client import HTTPClient` succeeds (with repo root on sys.path) + - File imports `retry` from `tenacity` (the parenthesized `from tenacity import (...)` block shown above) + - File contains `logger = logging.getLogger("crawler_core.http_client")` + - File contains `wait_random_exponential(multiplier=1, min=10, max=30)` — exact values + - File contains `stop_after_attempt(3)` + - File does NOT contain `import loguru` or `from loguru` anywhere + - File does NOT contain `from spiderJobs` or `from app` anywhere + - File is under 200 lines (source is 155 lines + ~30 lines of additions) + - `grep -c "from tenacity" /Users/win/2025/AICoding/JobData/crawler_core/http_client.py` outputs `1` + - `grep "min=10" /Users/win/2025/AICoding/JobData/crawler_core/http_client.py` has output + + HTTPClient ported with retry (3 attempts, min=10s wait) and stdlib logging. No loguru. No spiderJobs imports. 
+ + + + Task 3: Create crawler_core/base.py and crawler_core/__init__.py + + - /Users/win/2025/AICoding/JobData/spiderJobs/core/base.py (source to port — read every line) + - /Users/win/2025/AICoding/JobData/.planning/research/ARCHITECTURE.md (abstract base class hierarchy section) + - /Users/win/2025/AICoding/JobData/.planning/phases/01-shared-core/1-CONTEXT.md (D-05, D-06, D-07: base class interface decisions) + + + crawler_core/base.py + crawler_core/__init__.py + + +Port `spiderJobs/core/base.py` to `crawler_core/base.py` and create the public `__init__.py`. + +**crawler_core/base.py:** + +Port the full file from `spiderJobs/core/base.py` with ONE import change: + +Change: +```python +from spiderJobs.core.http_client import HTTPClient +``` + +To: +```python +from crawler_core.http_client import HTTPClient +``` + +Everything else stays identical to `spiderJobs/core/base.py`: +- `ApiResult` dataclass with fields: `success`, `status_code`, `data`, `list`, `count`, `is_end_page`, `error` +- `parse_response(http_code, raw)` function +- `BaseFetcher` class with `ENDPOINT`, `__init__`, `_build_params`, `_parse`, `fetch` +- `BaseSearcher` class with `ENDPOINT`, `__init__`, `_build_params`, `_request`, `_parse`, `search`, `load_all` + +Add module docstring at the top: +```python +""" +crawler_core.base — 通用基类与数据结构 + +提供所有招聘平台共用的: ApiResult, BaseFetcher, BaseSearcher, parse_response +不依赖任何平台特定代码。 +""" +``` + +Replace the existing inline print in `load_all`: +```python +# Change this: +print(f"第 {page_index} 页失败: {result.error}") + +# To this (use stdlib logging, not print): +import logging as _logging +_log = _logging.getLogger("crawler_core.base") +_log.warning("第 %d 页失败: %s", page_index, result.error) +``` + +Actually, define the logger at module level (not inside the method): +```python +import logging +_logger = logging.getLogger("crawler_core.base") +``` + +Then in `load_all`, replace `print(...)` with `_logger.warning(...)`. 
+ +**crawler_core/__init__.py:** + +```python +""" +crawler_core — 招聘爬虫共享核心包 + +安装方式: pip install -e ./crawler_core +使用方式: from crawler_core import BaseFetcher, BaseSearcher, ApiResult, HTTPClient +""" + +from crawler_core.base import ApiResult, BaseFetcher, BaseSearcher, parse_response +from crawler_core.http_client import HTTPClient + +__all__ = [ + "ApiResult", + "BaseFetcher", + "BaseSearcher", + "HTTPClient", + "parse_response", +] + +__version__ = "0.1.0" +``` + +**Do NOT:** +- Change the logic of `BaseFetcher.fetch()`, `BaseSearcher.search()`, or `BaseSearcher.load_all()` beyond the logger swap +- Import from `spiderJobs.*` or `app.*` +- Import loguru +- Add any platform-specific code to base.py or __init__.py + + + cd /Users/win/2025/AICoding/JobData && python -c " +import sys +sys.path.insert(0, '.') +from crawler_core import BaseFetcher, BaseSearcher, ApiResult, HTTPClient, parse_response +import dataclasses +fields = {f.name for f in dataclasses.fields(ApiResult)} +assert fields == {'success','status_code','data','list','count','is_end_page','error'}, f'ApiResult fields wrong: {fields}' +assert hasattr(BaseFetcher, 'fetch'), 'BaseFetcher.fetch missing' +assert hasattr(BaseSearcher, 'load_all'), 'BaseSearcher.load_all missing' +print('All imports OK, ApiResult fields OK') +" + + + - `from crawler_core import BaseFetcher, BaseSearcher, ApiResult, HTTPClient` succeeds (with repo root on sys.path) + - `crawler_core/base.py` does NOT contain `from spiderJobs` anywhere: `grep "from spiderJobs" crawler_core/base.py` returns empty + - `crawler_core/base.py` does NOT contain `print(` anywhere: `grep "print(" crawler_core/base.py` returns empty + - `crawler_core/__init__.py` contains `__all__` with all 5 exports + - `crawler_core/__init__.py` contains `__version__ = "0.1.0"` + - `ApiResult` dataclass has exactly 7 fields: success, status_code, data, list, count, is_end_page, error + - `BaseFetcher._build_params` raises `NotImplementedError` + - 
`BaseSearcher._build_params` raises `NotImplementedError` + + base.py ported (no spiderJobs imports, no print statements), __init__.py exposes clean public API. + + + + + +Run the full import chain to verify the package works end-to-end before moving to Plan 02: + +```bash +cd /Users/win/2025/AICoding/JobData +python -c " +import sys +sys.path.insert(0, '.') +from crawler_core import BaseFetcher, BaseSearcher, ApiResult, HTTPClient, parse_response + +# Verify ApiResult structure +r = ApiResult(success=True, status_code=200) +assert r.success and r.list == [] and r.error is None + +# Verify BaseFetcher requires _build_params +class TestFetcher(BaseFetcher): + ENDPOINT = '/test' + def _build_params(self): + return {'q': 'test'} + +# Verify parse_response with dict input +result = parse_response(200, {'statusCode': 200, 'data': {'list': [{'id': 1}], 'count': 1, 'isEndPage': False}}) +assert result.success +assert result.list == [{'id': 1}] +assert not result.is_end_page + +print('All verification checks passed') +" +``` + +Also confirm no cross-contamination: +```bash +grep -r "from spiderJobs" /Users/win/2025/AICoding/JobData/crawler_core/ && echo "FAIL: found spiderJobs import" || echo "OK: no spiderJobs imports" +grep -r "from app" /Users/win/2025/AICoding/JobData/crawler_core/ && echo "FAIL: found app import" || echo "OK: no app imports" +grep -r "loguru" /Users/win/2025/AICoding/JobData/crawler_core/ && echo "FAIL: found loguru" || echo "OK: no loguru" +``` + + + +1. `python -c "from crawler_core import BaseFetcher, BaseSearcher, ApiResult, HTTPClient"` exits 0 (with repo root on sys.path) +2. `crawler_core/pyproject.toml` passes `python -c "import tomllib; tomllib.load(open('crawler_core/pyproject.toml','rb'))"` +3. `grep "requests_go" Pipfile` has output — dependency declared +4. `grep "tenacity" Pipfile` has output — dependency declared +5. `grep "pytest" Pipfile` has output — dev dependency declared +6. 
`grep -r "from spiderJobs" crawler_core/` has NO output +7. `grep -r "loguru" crawler_core/` has NO output +8. `grep "min=10" crawler_core/http_client.py` has output — anti-detection delay preserved +9. `spiderJobs/` and `jobs_spider/` directories are UNCHANGED (no files modified) + + + +After completion, create `.planning/phases/01-shared-core/01-01-SUMMARY.md` with: +- What was created (file list with line counts) +- Key decisions made (pyproject.toml structure, tenacity config values, logging approach) +- Interface contracts (the public exports from crawler_core/__init__.py) +- Any deviations from this plan and why + diff --git a/.planning/phases/01-shared-core/01-02-PLAN.md b/.planning/phases/01-shared-core/01-02-PLAN.md new file mode 100644 index 0000000..83eedc0 --- /dev/null +++ b/.planning/phases/01-shared-core/01-02-PLAN.md @@ -0,0 +1,663 @@ +--- +phase: 01-shared-core +plan: 02 +type: execute +wave: 2 +depends_on: + - 01-01 +files_modified: + - crawler_core/boss/sign.py + - crawler_core/qcwy/sign.py + - crawler_core/zhilian/sign.py + - tests/crawler_core/__init__.py + - tests/crawler_core/test_boss_sign.py + - tests/crawler_core/test_qcwy_sign.py + - tests/crawler_core/test_zhilian_sign.py +autonomous: true +requirements: + - QUAL-01 + +must_haves: + truths: + - "Three sign algorithm files exist under crawler_core/{boss,qcwy,zhilian}/sign.py" + - "`pytest tests/crawler_core/` exits 0 — all sign algorithm tests pass" + - "Sign tests require NO network access, NO tokens, NO mocks — pure function assertions" + - "BossSign.generate_traceid() returns a string matching the pattern M-W[0-9a-zA-Z]{22}" + - "Job51Sign.build_sign_path() returns a tuple of (str, str) — (url_path, sign_hex)" + - "ZhilianSign.sign_headers() returns a dict with key 'x-zp-device-id'" + - "Old spiderJobs/platforms/*/sign.py files are NOT deleted or modified" + artifacts: + - path: "crawler_core/boss/sign.py" + provides: "Boss traceid generation — pure functions, no I/O" + exports: 
["BossSign"] + - path: "crawler_core/qcwy/sign.py" + provides: "51Job HMAC-SHA256 signing — pure functions, no I/O" + exports: ["Job51Sign", "SIGN_KEY"] + - path: "crawler_core/zhilian/sign.py" + provides: "Zhilian header/param signing — pure functions, no I/O" + exports: ["ZhilianSign"] + - path: "tests/crawler_core/test_boss_sign.py" + provides: "BossSign unit tests — 6+ test cases" + - path: "tests/crawler_core/test_qcwy_sign.py" + provides: "Job51Sign unit tests — 5+ test cases" + - path: "tests/crawler_core/test_zhilian_sign.py" + provides: "ZhilianSign unit tests — 5+ test cases" + key_links: + - from: "tests/crawler_core/test_boss_sign.py" + to: "crawler_core/boss/sign.py" + via: "from crawler_core.boss.sign import BossSign, _compute_checksum, _generate_uuid" + - from: "crawler_core/boss/sign.py" + to: "stdlib only" + via: "import random, time — no external deps" +--- + + +Migrate all three platform sign algorithm files into crawler_core/ and write comprehensive unit tests for each. Sign algorithms are pure functions — they are the highest-value, lowest-cost tests in the codebase. + +Purpose: QUAL-01 requires unit test coverage for core signing algorithms. Tests validate that the ported algorithms match the originals. Pure functions mean no HTTP mocking needed. + +Output: Three sign.py files in crawler_core/{boss,qcwy,zhilian}/ and three test files in tests/crawler_core/ — all tests pass with `pytest tests/crawler_core/`. + + + +@~/.claude/get-shit-done/workflows/execute-plan.md +@~/.claude/get-shit-done/templates/summary.md + + + +@.planning/phases/01-shared-core/01-01-SUMMARY.md +@.planning/phases/01-shared-core/1-CONTEXT.md + + + + +From spiderJobs/platforms/boss/sign.py: +```python +# Module-level constants: +_CHARS = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" + +# Module-level private functions: +def _to_u32(n: int) -> int: ... # Masks to 32-bit unsigned +def _compute_checksum(uuid_str: str) -> str: ... 
# 3-char checksum from 19-char uuid +def _generate_uuid() -> str: ... # 13-char hex timestamp + 6-char base62 random + +class BossSign: + def __init__(self, *, mpt: str = "", wt2: str = ""): ... + @staticmethod + def generate_traceid(prefix: str = "M-W") -> str: ... + # Result format: "{prefix}{19-char uuid}{3-char checksum}" e.g. "M-W0019d0a8af5f32gtVvnD4M" +``` + +From spiderJobs/platforms/job51/sign.py: +```python +SIGN_KEY = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b" + +class Job51Sign: + def __init__(self, *, sign_key: str = SIGN_KEY): ... + @staticmethod + def generate_uuid() -> str: ... # 13-char timestamp + 10-char random int + def build_sign_path( + self, + endpoint: str, + method: str = "GET", + params: dict | None = None, + body: dict | None = None, + ) -> tuple[str, str]: ... # Returns (url_path, sign_hex) + # url_path format: "/{endpoint}?api_key=51job&timestamp={ts}[&param=val...]" + # sign_hex: HMAC-SHA256 hex string, 64 chars +``` + +From spiderJobs/platforms/zhilian/sign.py: +```python +class ZhilianSign: + def __init__(self, *, at="", rt="", device_id=None, version="4.1.259", + channel="wxxiaochengxu", platform="12"): ... + @staticmethod + def generate_uuid() -> str: ... # UUID4-format with uppercase hex + def sign_headers(self, page_code: str = "0") -> dict: ... + # Returns dict with keys: x-zp-at, x-zp-rt, x-zp-action-id, x-zp-page-code, + # x-zp-version, x-zp-channel, x-zp-platform, x-zp-device-id, x-zp-business-system + def sign_params(self) -> dict: ... 
+ # Returns dict with keys: at, rt, channel, platform, version, d +``` + + + + + + + Task 1: Port sign algorithms to crawler_core/ platform directories + + - /Users/win/2025/AICoding/JobData/spiderJobs/platforms/boss/sign.py (source — read every line before writing) + - /Users/win/2025/AICoding/JobData/spiderJobs/platforms/job51/sign.py (source — read every line before writing) + - /Users/win/2025/AICoding/JobData/spiderJobs/platforms/zhilian/sign.py (source — read every line before writing) + - /Users/win/2025/AICoding/JobData/crawler_core/boss/__init__.py (confirm it exists from Plan 01) + + + crawler_core/boss/sign.py + crawler_core/qcwy/sign.py + crawler_core/zhilian/sign.py + + + - BossSign.generate_traceid("M-W") returns a 25-char string starting with "M-W" + - BossSign.generate_traceid("M-W") result matches regex r'^M-W[0-9a-f]{13}[0-9a-zA-Z]{6}[0-9a-zA-Z]{3}$' + - _compute_checksum produces exactly 3 characters from the _CHARS set + - _generate_uuid produces exactly 19 characters (13 hex + 6 base62) + - Job51Sign().build_sign_path("open/test", "GET") returns tuple of length 2 + - Job51Sign().build_sign_path("open/test", "GET")[0] starts with "/open/test?api_key=51job&timestamp=" + - Job51Sign().build_sign_path("open/test", "GET")[1] is 64-char hex string (HMAC-SHA256) + - Job51Sign().build_sign_path("open/test", "POST", body={"k": "v"})[1] != Job51Sign().build_sign_path("open/test", "GET")[1] — method affects signature + - ZhilianSign().sign_headers() returns dict with exactly 9 keys + - ZhilianSign().sign_headers()["x-zp-business-system"] == "73" + - ZhilianSign().sign_params() returns dict with exactly 6 keys: at, rt, channel, platform, version, d + - ZhilianSign(at="tok123").sign_params()["at"] == "tok123" + + +Copy the three sign algorithm files to their new locations under crawler_core/, making only one change per file: update the module docstring to reference crawler_core. 
+ +**crawler_core/boss/sign.py:** +Copy `spiderJobs/platforms/boss/sign.py` verbatim. Update the module docstring first line to: +``` +Boss直聘 Traceid 生成算法 (crawler_core) +``` +No other changes. ALL private functions (_to_u32, _compute_checksum, _generate_uuid) and the BossSign class stay exactly as-is. + +**crawler_core/qcwy/sign.py:** +Copy `spiderJobs/platforms/job51/sign.py` verbatim. Update the module docstring first line to: +``` +前程无忧 (51Job) 签名算法 (crawler_core) +``` +No other changes. SIGN_KEY constant and Job51Sign class stay exactly as-is. The `import json` inside `build_sign_path` stays inside the method (do not hoist it). + +**crawler_core/zhilian/sign.py:** +Copy `spiderJobs/platforms/zhilian/sign.py` verbatim. Update the module docstring first line to: +``` +智联招聘签名算法 (crawler_core) +``` +No other changes. ZhilianSign class stays exactly as-is. + +**Verification after writing each file:** +- No file imports from `spiderJobs.*` +- No file imports from `app.*` +- No file uses `requests` or any HTTP library +- Each file is entirely self-contained with only stdlib imports (random, time, math, hmac, hashlib, urllib.parse) + +**DO NOT delete or modify:** +- `spiderJobs/platforms/boss/sign.py` +- `spiderJobs/platforms/job51/sign.py` +- `spiderJobs/platforms/zhilian/sign.py` + +These old files remain in place until Phase 4 cleanup. 
+ + + cd /Users/win/2025/AICoding/JobData && python -c " +import sys +sys.path.insert(0, '.') +import re + +from crawler_core.boss.sign import BossSign, _compute_checksum, _generate_uuid +from crawler_core.qcwy.sign import Job51Sign, SIGN_KEY +from crawler_core.zhilian.sign import ZhilianSign + +# Boss +tid = BossSign.generate_traceid() +assert re.match(r'^M-W[0-9a-f]{13}[0-9a-zA-Z]{6}[0-9a-zA-Z]{3}$', tid), f'Traceid format wrong: {tid}' +print(f'BossSign OK: {tid}') + +# Job51 +signer = Job51Sign() +path, sign = signer.build_sign_path('open/test', 'GET', params={'key': 'val'}) +assert path.startswith('/open/test?api_key=51job&timestamp='), f'Path format wrong: {path}' +assert len(sign) == 64, f'Sign length wrong: {len(sign)}' +print(f'Job51Sign OK: sign={sign[:8]}...') + +# Zhilian +zs = ZhilianSign(at='token123', rt='refresh456') +headers = zs.sign_headers() +assert 'x-zp-device-id' in headers, 'device-id missing' +assert headers['x-zp-business-system'] == '73', 'business-system wrong' +assert len(headers) == 9, f'Expected 9 header keys, got {len(headers)}' +params = zs.sign_params() +assert params['at'] == 'token123', f'at field wrong: {params[\"at\"]}' +assert len(params) == 6, f'Expected 6 param keys, got {len(params)}' +print('ZhilianSign OK') +print('All sign algorithms imported and validated') +" + + + - `crawler_core/boss/sign.py` exists: `ls crawler_core/boss/sign.py` exits 0 + - `crawler_core/qcwy/sign.py` exists: `ls crawler_core/qcwy/sign.py` exits 0 + - `crawler_core/zhilian/sign.py` exists: `ls crawler_core/zhilian/sign.py` exits 0 + - `grep -r "from spiderJobs" crawler_core/` returns empty (no cross-imports) + - `grep -r "import requests" crawler_core/boss/sign.py crawler_core/qcwy/sign.py crawler_core/zhilian/sign.py` returns empty (no HTTP imports in sign files) + - `BossSign.generate_traceid()` returns a string of length 25 (3-char prefix + 19-char uuid + 3-char checksum) + - `Job51Sign().build_sign_path("test", "GET")` returns tuple where `[1]` is 
64-char string + - `ZhilianSign().sign_headers()` returns dict with `x-zp-business-system` == `"73"` + - Old files UNCHANGED: `diff spiderJobs/platforms/boss/sign.py crawler_core/boss/sign.py` shows only docstring difference + + Three sign.py files in crawler_core/ — pure functions, no HTTP, no cross-imports from app or spiderJobs. + + + + Task 2: Write sign algorithm unit tests + + - /Users/win/2025/AICoding/JobData/crawler_core/boss/sign.py (just created — read to understand exact exports) + - /Users/win/2025/AICoding/JobData/crawler_core/qcwy/sign.py (just created) + - /Users/win/2025/AICoding/JobData/crawler_core/zhilian/sign.py (just created) + - /Users/win/2025/AICoding/JobData/.planning/research/STACK.md (testing section: "Sign algorithm tests — Pure functions, no HTTP, no mocks needed") + - /Users/win/2025/AICoding/JobData/tests/ (check what already exists to avoid conflicts) + + + tests/crawler_core/__init__.py + tests/crawler_core/test_boss_sign.py + tests/crawler_core/test_qcwy_sign.py + tests/crawler_core/test_zhilian_sign.py + + + Boss sign tests: + - test_traceid_format: generate_traceid() matches regex r'^M-W[0-9a-f]{13}[0-9a-zA-Z]{6}[0-9a-zA-Z]{3}$' + - test_traceid_length: generate_traceid() is exactly 25 chars + - test_traceid_custom_prefix: generate_traceid("X-Y") starts with "X-Y" + - test_traceid_uniqueness: two calls return different values + - test_compute_checksum_length: _compute_checksum(any 19-char string) returns 3 chars + - test_compute_checksum_chars: all 3 chars are in _CHARS (base62) + - test_compute_checksum_deterministic: same input → same output + - test_generate_uuid_length: _generate_uuid() returns 19 chars + + Job51 sign tests: + - test_build_sign_path_get_format: GET path starts with "/{endpoint}?api_key=51job&timestamp=" + - test_build_sign_path_returns_tuple: returns tuple of (str, str) + - test_sign_hex_length: sign is 64-char hex string matching r'^[0-9a-f]{64}$' + - test_get_vs_post_different_sign: same endpoint, different 
method → different sign + - test_sign_with_params_includes_params_in_path: GET with params={'k':'v'} → path contains "k=v" + - test_sign_key_in_path: path contains "api_key=51job" + - test_generate_uuid_length: generate_uuid() returns string of length 23 (13+10) + + Zhilian sign tests: + - test_sign_headers_keys: returns dict with exactly these 9 keys: x-zp-at, x-zp-rt, x-zp-action-id, x-zp-page-code, x-zp-version, x-zp-channel, x-zp-platform, x-zp-device-id, x-zp-business-system + - test_sign_headers_business_system: x-zp-business-system == "73" + - test_sign_headers_tokens: x-zp-at and x-zp-rt reflect constructor args + - test_sign_params_keys: returns dict with exactly these 6 keys: at, rt, channel, platform, version, d + - test_sign_params_device_id_matches: d == device_id from constructor + - test_generate_uuid_format: matches UUID4 pattern r'^[0-9A-F]{8}-[0-9A-F]{4}-4[0-9A-F]{3}-[89AB][0-9A-F]{3}-[0-9A-F]{12}$' + - test_action_id_unique_per_call: two sign_headers() calls produce different x-zp-action-id values + + +Create `tests/crawler_core/__init__.py` (empty file) and three test files. + +**tests/crawler_core/__init__.py:** Empty file (just creates the package). + +**Important: Add sys.path setup to each test file** since crawler_core is not yet pip-installed: + +```python +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) +``` + +Place this at the TOP of each test file before any crawler_core imports. + +--- + +**tests/crawler_core/test_boss_sign.py:** + +```python +"""Unit tests for crawler_core.boss.sign — BossSign and helper functions. + +All tests are pure function assertions: no HTTP, no network, no mocks. 
+""" +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) + +import re +import pytest +from crawler_core.boss.sign import BossSign, _compute_checksum, _generate_uuid, _CHARS + + +class TestBossSignGenerateTraceid: + def test_traceid_format(self): + tid = BossSign.generate_traceid() + assert re.match(r'^M-W[0-9a-f]{13}[0-9a-zA-Z]{6}[0-9a-zA-Z]{3}$', tid), \ + f"Traceid format wrong: {tid}" + + def test_traceid_length(self): + tid = BossSign.generate_traceid() + assert len(tid) == 25, f"Expected 25 chars, got {len(tid)}: {tid}" + + def test_traceid_custom_prefix(self): + tid = BossSign.generate_traceid(prefix="X-Y") + assert tid.startswith("X-Y"), f"Expected X-Y prefix, got: {tid}" + + def test_traceid_uniqueness(self): + t1 = BossSign.generate_traceid() + t2 = BossSign.generate_traceid() + assert t1 != t2, "Two calls should return different traceids" + + def test_bosssign_init_defaults(self): + sign = BossSign() + assert sign.mpt == "" + assert sign.wt2 == "" + + def test_bosssign_init_with_tokens(self): + sign = BossSign(mpt="mpt_token", wt2="wt2_token") + assert sign.mpt == "mpt_token" + assert sign.wt2 == "wt2_token" + + +class TestComputeChecksum: + def test_checksum_length(self): + checksum = _compute_checksum("1234567890abc456789") # 19 chars + assert len(checksum) == 3, f"Expected 3 chars, got {len(checksum)}" + + def test_checksum_chars_in_base62(self): + checksum = _compute_checksum("1234567890abc456789") + for ch in checksum: + assert ch in _CHARS, f"Char {ch!r} not in base62 set" + + def test_checksum_deterministic(self): + uuid_str = "1234567890abc456789" + c1 = _compute_checksum(uuid_str) + c2 = _compute_checksum(uuid_str) + assert c1 == c2, "Same input must produce same checksum" + + def test_checksum_differs_for_different_input(self): + # Different inputs should (almost always) produce different checksums + c1 = _compute_checksum("1234567890abc456789") + c2 = _compute_checksum("9876543210xyz456789") + # Not 
guaranteed to differ but extremely likely + # We test at least that they are valid 3-char strings + assert len(c1) == 3 and len(c2) == 3 + + +class TestGenerateUuid: + def test_generate_uuid_length(self): + uuid = _generate_uuid() + assert len(uuid) == 19, f"Expected 19 chars, got {len(uuid)}: {uuid}" + + def test_generate_uuid_hex_prefix(self): + uuid = _generate_uuid() + hex_part = uuid[:13] + assert re.match(r'^[0-9a-f]{13}$', hex_part), \ + f"First 13 chars should be hex: {hex_part}" + + def test_generate_uuid_base62_suffix(self): + uuid = _generate_uuid() + rand_part = uuid[13:] + for ch in rand_part: + assert ch in _CHARS, f"Char {ch!r} in random suffix not in base62" +``` + +--- + +**tests/crawler_core/test_qcwy_sign.py:** + +```python +"""Unit tests for crawler_core.qcwy.sign — Job51Sign. + +All tests are pure function assertions: no HTTP, no network, no mocks. +""" +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) + +import re +import pytest +from crawler_core.qcwy.sign import Job51Sign, SIGN_KEY + + +class TestJob51SignInit: + def test_default_sign_key(self): + signer = Job51Sign() + assert signer.sign_key == SIGN_KEY + assert len(SIGN_KEY) == 64 # 64-char hex key + + def test_custom_sign_key(self): + custom_key = "a" * 64 + signer = Job51Sign(sign_key=custom_key) + assert signer.sign_key == custom_key + + +class TestJob51SignBuildSignPath: + def setup_method(self): + self.signer = Job51Sign() + + def test_returns_tuple_of_two_strings(self): + result = self.signer.build_sign_path("open/test") + assert isinstance(result, tuple) + assert len(result) == 2 + assert all(isinstance(s, str) for s in result) + + def test_get_path_format(self): + path, sign = self.signer.build_sign_path("open/test", "GET") + assert path.startswith("/open/test?api_key=51job&timestamp="), \ + f"Path format wrong: {path}" + + def test_sign_hex_length(self): + _, sign = self.signer.build_sign_path("open/test") + assert len(sign) == 64, f"Sign
should be 64-char hex, got {len(sign)}: {sign}" + + def test_sign_hex_format(self): + _, sign = self.signer.build_sign_path("open/test") + assert re.match(r'^[0-9a-f]{64}$', sign), f"Sign not hex: {sign}" + + def test_get_vs_post_different_sign(self): + _, get_sign = self.signer.build_sign_path("open/test", "GET") + _, post_sign = self.signer.build_sign_path("open/test", "POST", body={"k": "v"}) + assert get_sign != post_sign, "GET and POST should produce different signatures" + + def test_get_with_params_includes_params_in_path(self): + path, _ = self.signer.build_sign_path("open/test", "GET", params={"city": "shanghai"}) + assert "city" in path and "shanghai" in path, \ + f"Params should appear in path: {path}" + + def test_sign_key_in_path(self): + path, _ = self.signer.build_sign_path("open/jobs") + assert "api_key=51job" in path, f"api_key=51job missing from path: {path}" + + +class TestJob51SignGenerateUuid: + def test_generate_uuid_is_string(self): + uuid = Job51Sign.generate_uuid() + assert isinstance(uuid, str) + + def test_generate_uuid_length(self): + uuid = Job51Sign.generate_uuid() + # 13-char ms timestamp + 10-char random int = 23 chars + assert len(uuid) == 23, f"Expected 23 chars, got {len(uuid)}: {uuid}" + + def test_generate_uuid_numeric(self): + uuid = Job51Sign.generate_uuid() + assert uuid.isdigit(), f"UUID should be all digits: {uuid}" +``` + +--- + +**tests/crawler_core/test_zhilian_sign.py:** + +```python +"""Unit tests for crawler_core.zhilian.sign — ZhilianSign. + +All tests are pure function assertions: no HTTP, no network, no mocks. 
+""" +import sys +import os +sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..', '..')) + +import re +import pytest +from crawler_core.zhilian.sign import ZhilianSign + +EXPECTED_HEADER_KEYS = { + "x-zp-at", "x-zp-rt", "x-zp-action-id", "x-zp-page-code", + "x-zp-version", "x-zp-channel", "x-zp-platform", "x-zp-device-id", + "x-zp-business-system", +} + +EXPECTED_PARAM_KEYS = {"at", "rt", "channel", "platform", "version", "d"} + + +class TestZhilianSignInit: + def test_defaults(self): + sign = ZhilianSign() + assert sign.at == "" + assert sign.rt == "" + assert sign.version == "4.1.259" + assert sign.channel == "wxxiaochengxu" + assert sign.platform == "12" + assert sign.device_id # auto-generated, not empty + + def test_custom_tokens(self): + sign = ZhilianSign(at="at_token", rt="rt_token") + assert sign.at == "at_token" + assert sign.rt == "rt_token" + + def test_custom_device_id(self): + sign = ZhilianSign(device_id="CUSTOM-DEVICE-ID") + assert sign.device_id == "CUSTOM-DEVICE-ID" + + def test_auto_device_id_is_uuid4_format(self): + sign = ZhilianSign() + uuid_pattern = r'^[0-9A-F]{8}-[0-9A-F]{4}-4[0-9A-F]{3}-[89AB][0-9A-F]{3}-[0-9A-F]{12}$' + assert re.match(uuid_pattern, sign.device_id), \ + f"device_id not UUID4 format: {sign.device_id}" + + +class TestZhilianSignHeaders: + def setup_method(self): + self.sign = ZhilianSign(at="at123", rt="rt456") + + def test_keys_exactly_nine(self): + headers = self.sign.sign_headers() + assert set(headers.keys()) == EXPECTED_HEADER_KEYS, \ + f"Header keys wrong: {set(headers.keys())}" + + def test_business_system_is_73(self): + headers = self.sign.sign_headers() + assert headers["x-zp-business-system"] == "73" + + def test_tokens_reflected(self): + headers = self.sign.sign_headers() + assert headers["x-zp-at"] == "at123" + assert headers["x-zp-rt"] == "rt456" + + def test_action_id_is_uuid4_format(self): + headers = self.sign.sign_headers() + action_id = headers["x-zp-action-id"] + uuid_pattern = 
r'^[0-9A-F]{8}-[0-9A-F]{4}-4[0-9A-F]{3}-[89AB][0-9A-F]{3}-[0-9A-F]{12}$' + assert re.match(uuid_pattern, action_id), \ + f"action_id not UUID4 format: {action_id}" + + def test_action_id_unique_per_call(self): + h1 = self.sign.sign_headers() + h2 = self.sign.sign_headers() + assert h1["x-zp-action-id"] != h2["x-zp-action-id"], \ + "action_id must be freshly generated on each call" + + def test_device_id_in_headers(self): + headers = self.sign.sign_headers() + assert headers["x-zp-device-id"] == self.sign.device_id + + +class TestZhilianSignParams: + def setup_method(self): + self.sign = ZhilianSign(at="at789", rt="rt012", device_id="DEV-ID") + + def test_keys_exactly_six(self): + params = self.sign.sign_params() + assert set(params.keys()) == EXPECTED_PARAM_KEYS, \ + f"Param keys wrong: {set(params.keys())}" + + def test_device_id_as_d(self): + params = self.sign.sign_params() + assert params["d"] == "DEV-ID" + + def test_tokens_reflected(self): + params = self.sign.sign_params() + assert params["at"] == "at789" + assert params["rt"] == "rt012" + + +class TestZhilianGenerateUuid: + def test_uuid4_format(self): + uuid = ZhilianSign.generate_uuid() + uuid_pattern = r'^[0-9A-F]{8}-[0-9A-F]{4}-4[0-9A-F]{3}-[89AB][0-9A-F]{3}-[0-9A-F]{12}$' + assert re.match(uuid_pattern, uuid), \ + f"UUID not UUID4 format: {uuid}" + + def test_uuid_length(self): + uuid = ZhilianSign.generate_uuid() + assert len(uuid) == 36, f"Expected 36 chars, got {len(uuid)}" + + def test_uuid_version_4(self): + uuid = ZhilianSign.generate_uuid() + assert uuid[14] == "4", f"Version digit should be 4, got: {uuid[14]}" +``` + +After creating all files, run the tests to verify they pass. If any test fails due to a mismatch between the test expectation and the actual sign algorithm behavior, fix the TEST (not the sign algorithm) to match the actual behavior — the sign algorithms are the ground truth. 
+ + + cd /Users/win/2025/AICoding/JobData && python -m pytest tests/crawler_core/ -v --tb=short 2>&1 | tail -30 + + + - `tests/crawler_core/__init__.py` exists (even if empty) + - `tests/crawler_core/test_boss_sign.py` exists with at least 8 test functions + - `tests/crawler_core/test_qcwy_sign.py` exists with at least 7 test functions + - `tests/crawler_core/test_zhilian_sign.py` exists with at least 9 test functions + - `python -m pytest tests/crawler_core/ -v` exits 0 — ALL tests pass + - `python -m pytest tests/crawler_core/ -v` output contains "passed" and zero "failed" or "error" + - No test in any file makes HTTP requests or reads from files (pure function tests only) + - `grep -rE --include="*.py" "^(import|from) +(requests|httpx|mock|unittest\.mock)" tests/crawler_core/` returns empty (no HTTP or mocking imports; a bare-word grep for "mock" would match the "no mocks" wording in the test docstrings) + - Test count: `python -m pytest tests/crawler_core/ --collect-only -q` reports at least 24 tests collected + + Three test files cover all sign algorithm edge cases. `pytest tests/crawler_core/` exits 0. No network access required. + + + + + +Run the complete verification suite to confirm Phase 1 is done: + +```bash +cd /Users/win/2025/AICoding/JobData + +# 1. All sign algorithms importable from crawler_core +python -c " +import sys +sys.path.insert(0, '.') +from crawler_core.boss.sign import BossSign +from crawler_core.qcwy.sign import Job51Sign +from crawler_core.zhilian.sign import ZhilianSign +print('All sign imports OK') +" + +# 2. All tests pass +python -m pytest tests/crawler_core/ -v + +# 3. No cross-contamination +grep -r "from spiderJobs" crawler_core/ && echo "FAIL" || echo "OK: no spiderJobs imports" +grep -r "import requests" crawler_core/boss/sign.py crawler_core/qcwy/sign.py crawler_core/zhilian/sign.py && echo "FAIL" || echo "OK: sign files have no HTTP imports" + +# 4. Old files untouched (git diff --quiet exits 1 when there are changes) +git diff --quiet spiderJobs/platforms/ && echo "OK: spiderJobs untouched" || echo "FAIL: spiderJobs modified" +``` + + + +1. 
`python -m pytest tests/crawler_core/ -v` exits 0 with at least 24 tests collected and passing +2. `from crawler_core.boss.sign import BossSign` succeeds (with repo root on sys.path) +3. `from crawler_core.qcwy.sign import Job51Sign` succeeds +4. `from crawler_core.zhilian.sign import ZhilianSign` succeeds +5. `grep -r "from spiderJobs" crawler_core/` returns empty +6. `grep -r "import requests" crawler_core/boss/sign.py crawler_core/qcwy/sign.py crawler_core/zhilian/sign.py` returns empty +7. `git diff --name-only spiderJobs/` returns empty — old files untouched +8. All three sign.py files in crawler_core/ are under 120 lines each (no bloat) + + + +After completion, create `.planning/phases/01-shared-core/01-02-SUMMARY.md` with: +- List of files created (sign.py files + test files with line counts) +- Test counts per file and total +- Any behavioral differences discovered between spiderJobs/ originals and crawler_core/ copies +- Confirmation that old spiderJobs/ files were not modified +