docs(phase-1): create plans (2 plans, 2 waves) with checker revision
This commit is contained in:
parent
b27686a409
commit
fe9a6d1403
@ -23,9 +23,9 @@ requirements:
|
|||||||
must_haves:
|
must_haves:
|
||||||
truths:
|
truths:
|
||||||
- "`pip install -e ./crawler_core` succeeds without errors"
|
- "`pip install -e ./crawler_core` succeeds without errors"
|
||||||
- "`from crawler_core import BaseFetcher, BaseSearcher, ApiResult, HTTPClient` imports cleanly"
|
- "`from crawler_core import BaseFetcher, BaseSearcher, Result, HTTPClient` imports cleanly"
|
||||||
- "HTTPClient retries failed requests up to 3 times with exponential backoff (minimum 10s wait)"
|
- "HTTPClient retries failed requests up to 3 times with exponential backoff (minimum 10s wait)"
|
||||||
- "All HTTP errors are logged to stderr via stdlib logging with level, url, and error message"
|
- "All HTTP errors are logged to stderr via stdlib logging.getLogger('crawler_core.*') in place; loguru bridge deferred to Phase 5"
|
||||||
- "Old spiderJobs/ and jobs_spider/ code is NOT modified — feature flag isolation holds"
|
- "Old spiderJobs/ and jobs_spider/ code is NOT modified — feature flag isolation holds"
|
||||||
artifacts:
|
artifacts:
|
||||||
- path: "crawler_core/pyproject.toml"
|
- path: "crawler_core/pyproject.toml"
|
||||||
@ -33,31 +33,32 @@ must_haves:
|
|||||||
contains: "name = \"crawler_core\""
|
contains: "name = \"crawler_core\""
|
||||||
- path: "crawler_core/__init__.py"
|
- path: "crawler_core/__init__.py"
|
||||||
provides: "Public API surface"
|
provides: "Public API surface"
|
||||||
exports: ["BaseFetcher", "BaseSearcher", "ApiResult", "HTTPClient"]
|
exports: ["BaseFetcher", "BaseSearcher", "Result", "HTTPClient"]
|
||||||
- path: "crawler_core/http_client.py"
|
- path: "crawler_core/http_client.py"
|
||||||
provides: "TLS-fingerprinted HTTP client with retry and logging"
|
provides: "TLS-fingerprinted HTTP client with retry and logging"
|
||||||
exports: ["HTTPClient"]
|
exports: ["HTTPClient"]
|
||||||
- path: "crawler_core/base.py"
|
- path: "crawler_core/base.py"
|
||||||
provides: "Template-method base classes"
|
provides: "Template-method base classes with generic Result[T] return type"
|
||||||
exports: ["ApiResult", "BaseFetcher", "BaseSearcher", "parse_response"]
|
exports: ["Result", "BaseFetcher", "BaseSearcher", "parse_response"]
|
||||||
key_links:
|
key_links:
|
||||||
- from: "crawler_core/__init__.py"
|
- from: "crawler_core/__init__.py"
|
||||||
to: "crawler_core/http_client.py"
|
to: "crawler_core/http_client.py"
|
||||||
via: "from crawler_core.http_client import HTTPClient"
|
via: "from crawler_core.http_client import HTTPClient"
|
||||||
- from: "crawler_core/__init__.py"
|
- from: "crawler_core/__init__.py"
|
||||||
to: "crawler_core/base.py"
|
to: "crawler_core/base.py"
|
||||||
via: "from crawler_core.base import BaseFetcher, BaseSearcher, ApiResult"
|
via: "from crawler_core.base import BaseFetcher, BaseSearcher, Result"
|
||||||
- from: "crawler_core/base.py"
|
- from: "crawler_core/base.py"
|
||||||
to: "crawler_core/http_client.py"
|
to: "crawler_core/http_client.py"
|
||||||
via: "from crawler_core.http_client import HTTPClient"
|
via: "from crawler_core.http_client import HTTPClient"
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|
||||||
<objective>
|
<objective>
|
||||||
Create the crawler_core/ installable shared package with its core infrastructure: HTTP client with TLS fingerprint, retry logic, stdlib logging, and the BaseFetcher/BaseSearcher template-method base classes.
|
Create the crawler_core/ installable shared package with its core infrastructure: HTTP client with TLS fingerprint, retry logic, stdlib logging, and the BaseFetcher/BaseSearcher template-method base classes.
|
||||||
|
|
||||||
Purpose: This is the foundation everything else depends on. Once installed with `pip install -e ./crawler_core`, Phase 2/3 platform rewrites can import from it instead of copying code.
|
Purpose: This is the foundation everything else depends on. Once installed with `pip install -e ./crawler_core`, Phase 2/3 platform rewrites can import from it instead of copying code.
|
||||||
|
|
||||||
Output: A working Python package at crawler_core/ that installs cleanly and exposes BaseFetcher, BaseSearcher, ApiResult, and HTTPClient.
|
Output: A working Python package at crawler_core/ that installs cleanly and exposes BaseFetcher, BaseSearcher, Result[T], and HTTPClient.
|
||||||
</objective>
|
</objective>
|
||||||
|
|
||||||
<execution_context>
|
<execution_context>
|
||||||
@ -72,7 +73,7 @@ Output: A working Python package at crawler_core/ that installs cleanly and expo
|
|||||||
|
|
||||||
<interfaces>
|
<interfaces>
|
||||||
<!-- Key existing code the executor must understand before creating crawler_core/ equivalents. -->
|
<!-- Key existing code the executor must understand before creating crawler_core/ equivalents. -->
|
||||||
<!-- DO NOT copy these verbatim — update the internal import paths. -->
|
<!-- DO NOT copy these verbatim — update the internal import paths and replace ApiResult with Result[T]. -->
|
||||||
|
|
||||||
From spiderJobs/core/http_client.py:
|
From spiderJobs/core/http_client.py:
|
||||||
```python
|
```python
|
||||||
@ -86,10 +87,10 @@ class HTTPClient:
|
|||||||
def get(self, path, params=None, headers=None) -> tuple[int, Any]: ...
|
def get(self, path, params=None, headers=None) -> tuple[int, Any]: ...
|
||||||
```
|
```
|
||||||
|
|
||||||
From spiderJobs/core/base.py:
|
From spiderJobs/core/base.py (reference only — DO NOT copy ApiResult; use Result[T] instead per D-07):
|
||||||
```python
|
```python
|
||||||
@dataclass
|
@dataclass
|
||||||
class ApiResult:
|
class ApiResult: # <-- OLD: replaced by Result[T] in crawler_core/base.py
|
||||||
success: bool
|
success: bool
|
||||||
status_code: int
|
status_code: int
|
||||||
data: Any = None
|
data: Any = None
|
||||||
@ -103,14 +104,14 @@ def parse_response(http_code: int, raw: Any) -> ApiResult: ...
|
|||||||
class BaseFetcher:
|
class BaseFetcher:
|
||||||
ENDPOINT: str = ""
|
ENDPOINT: str = ""
|
||||||
def __init__(self, http_client: HTTPClient): ...
|
def __init__(self, http_client: HTTPClient): ...
|
||||||
def _build_params(self) -> dict: raise NotImplementedError
|
def _build_params(self) -> dict: raise NotImplementedError # template method (required)
|
||||||
def _parse(self, http_code, raw) -> ApiResult: ...
|
def _parse(self, http_code, raw) -> ApiResult: ...
|
||||||
def fetch(self) -> ApiResult: ...
|
def fetch(self) -> ApiResult: ...
|
||||||
|
|
||||||
class BaseSearcher:
|
class BaseSearcher:
|
||||||
ENDPOINT: str = ""
|
ENDPOINT: str = ""
|
||||||
def __init__(self, page_size=15, http_client=None): ...
|
def __init__(self, page_size=15, http_client=None): ...
|
||||||
def _build_params(self, page_index) -> dict: raise NotImplementedError
|
def _build_params(self, page_index) -> dict: raise NotImplementedError # template method (required)
|
||||||
def _request(self, params) -> tuple[int, Any]: ...
|
def _request(self, params) -> tuple[int, Any]: ...
|
||||||
def _parse(self, http_code, raw) -> ApiResult: ...
|
def _parse(self, http_code, raw) -> ApiResult: ...
|
||||||
def search(self, page_index=1) -> ApiResult: ...
|
def search(self, page_index=1) -> ApiResult: ...
|
||||||
@ -337,7 +338,7 @@ print('HTTPClient OK')
|
|||||||
</task>
|
</task>
|
||||||
|
|
||||||
<task type="auto">
|
<task type="auto">
|
||||||
<name>Task 3: Create crawler_core/base.py and crawler_core/__init__.py</name>
|
<name>Task 3: Create crawler_core/base.py with Result[T] and 4 template methods, plus crawler_core/__init__.py</name>
|
||||||
<read_first>
|
<read_first>
|
||||||
- /Users/win/2025/AICoding/JobData/spiderJobs/core/base.py (source to port — read every line)
|
- /Users/win/2025/AICoding/JobData/spiderJobs/core/base.py (source to port — read every line)
|
||||||
- /Users/win/2025/AICoding/JobData/.planning/research/ARCHITECTURE.md (abstract base class hierarchy section)
|
- /Users/win/2025/AICoding/JobData/.planning/research/ARCHITECTURE.md (abstract base class hierarchy section)
|
||||||
@ -348,56 +349,186 @@ print('HTTPClient OK')
|
|||||||
crawler_core/__init__.py
|
crawler_core/__init__.py
|
||||||
</files>
|
</files>
|
||||||
<action>
|
<action>
|
||||||
Port `spiderJobs/core/base.py` to `crawler_core/base.py` and create the public `__init__.py`.
|
Create `crawler_core/base.py` with the new `Result[T]` generic dataclass (replacing ApiResult per D-07) and four template methods (per D-06), then create the public `__init__.py`.
|
||||||
|
|
||||||
**crawler_core/base.py:**
|
**crawler_core/base.py:**
|
||||||
|
|
||||||
Port the full file from `spiderJobs/core/base.py` with ONE import change:
|
|
||||||
|
|
||||||
Change:
|
|
||||||
```python
|
|
||||||
from spiderJobs.core.http_client import HTTPClient
|
|
||||||
```
|
|
||||||
|
|
||||||
To:
|
|
||||||
```python
|
|
||||||
from crawler_core.http_client import HTTPClient
|
|
||||||
```
|
|
||||||
|
|
||||||
Everything else stays identical to `spiderJobs/core/base.py`:
|
|
||||||
- `ApiResult` dataclass with fields: `success`, `status_code`, `data`, `list`, `count`, `is_end_page`, `error`
|
|
||||||
- `parse_response(http_code, raw)` function
|
|
||||||
- `BaseFetcher` class with `ENDPOINT`, `__init__`, `_build_params`, `_parse`, `fetch`
|
|
||||||
- `BaseSearcher` class with `ENDPOINT`, `__init__`, `_build_params`, `_request`, `_parse`, `search`, `load_all`
|
|
||||||
|
|
||||||
Add module docstring at the top:
|
Add module docstring at the top:
|
||||||
```python
|
```python
|
||||||
"""
|
"""
|
||||||
crawler_core.base — 通用基类与数据结构
|
crawler_core.base — 通用基类与数据结构
|
||||||
|
|
||||||
提供所有招聘平台共用的: ApiResult, BaseFetcher, BaseSearcher, parse_response
|
提供所有招聘平台共用的: Result, BaseFetcher, BaseSearcher, parse_response
|
||||||
不依赖任何平台特定代码。
|
不依赖任何平台特定代码。
|
||||||
"""
|
"""
|
||||||
```
|
```
|
||||||
|
|
||||||
Replace the existing inline print in `load_all`:
|
**Step 1: Generic Result[T] dataclass (replaces ApiResult — per D-07)**
|
||||||
```python
|
|
||||||
# Change this:
|
|
||||||
print(f"第 {page_index} 页失败: {result.error}")
|
|
||||||
|
|
||||||
# To this (use stdlib logging, not print):
|
|
||||||
import logging as _logging
|
|
||||||
_log = _logging.getLogger("crawler_core.base")
|
|
||||||
_log.warning("第 %d 页失败: %s", page_index, result.error)
|
|
||||||
```
|
|
||||||
|
|
||||||
Actually, define the logger at module level (not inside the method):
|
|
||||||
```python
|
```python
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any, Generic, Optional, TypeVar
|
||||||
|
|
||||||
|
from crawler_core.http_client import HTTPClient
|
||||||
|
|
||||||
|
T = TypeVar("T")
|
||||||
|
|
||||||
_logger = logging.getLogger("crawler_core.base")
|
_logger = logging.getLogger("crawler_core.base")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class Result(Generic[T]):
|
||||||
|
"""Typed result wrapper returned by all BaseFetcher and BaseSearcher methods.
|
||||||
|
|
||||||
|
Replaces the untyped ApiResult. Callers annotate as Result[MyJobModel] etc.
|
||||||
|
"""
|
||||||
|
success: bool
|
||||||
|
status_code: int
|
||||||
|
data: Optional[T] = None
|
||||||
|
list: list[T] = field(default_factory=list)
|
||||||
|
count: int = 0
|
||||||
|
is_end_page: bool = True
|
||||||
|
error: Optional[str] = None
|
||||||
```
|
```
|
||||||
|
|
||||||
Then in `load_all`, replace `print(...)` with `_logger.warning(...)`.
|
**Step 2: parse_response — adapt from spiderJobs/core/base.py but return Result[Any]**
|
||||||
|
|
||||||
|
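Port `parse_response(http_code, raw)` from `spiderJobs/core/base.py` with its logic unchanged, replacing the return type annotation `ApiResult` with `Result[Any]` and every `ApiResult(...)` construction in its body with `Result(...)`, so that no reference to `ApiResult` remains.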
|
||||||
|
|
||||||
|
**Step 3: BaseFetcher — 4 template methods (per D-06)**
|
||||||
|
|
||||||
|
```python
|
||||||
|
class BaseFetcher:
|
||||||
|
"""Template-method base class for single-item fetchers.
|
||||||
|
|
||||||
|
Required overrides: _build_params(), _parse()
|
||||||
|
Optional overrides: _build_headers(), _check_blocked()
|
||||||
|
"""
|
||||||
|
ENDPOINT: str = ""
|
||||||
|
|
||||||
|
def __init__(self, http_client: HTTPClient) -> None:
|
||||||
|
self.http_client = http_client
|
||||||
|
|
||||||
|
# --- Required template methods ---
|
||||||
|
|
||||||
|
def _build_params(self) -> dict:
|
||||||
|
"""Build query/body parameters for the request. MUST be overridden."""
|
||||||
|
raise NotImplementedError(f"{type(self).__name__} must implement _build_params()")
|
||||||
|
|
||||||
|
def _parse(self, http_code: int, raw: Any) -> Result:
|
||||||
|
"""Parse the HTTP response into a Result. MUST be overridden."""
|
||||||
|
raise NotImplementedError(f"{type(self).__name__} must implement _parse()")
|
||||||
|
|
||||||
|
# --- Optional template methods ---
|
||||||
|
|
||||||
|
def _build_headers(self) -> dict:
|
||||||
|
"""Build extra request headers. Override to add platform-specific headers.
|
||||||
|
|
||||||
|
Default: returns {} (no extra headers beyond HTTPClient defaults).
|
||||||
|
"""
|
||||||
|
return {}
|
||||||
|
|
||||||
|
def _check_blocked(self, status_code: int, body: str) -> bool:
|
||||||
|
"""Detect platform-specific anti-crawl blocks.
|
||||||
|
|
||||||
|
Override to inspect response body/status for block signals.
|
||||||
|
Default: returns False (assume not blocked).
|
||||||
|
"""
|
||||||
|
return False
|
||||||
|
|
||||||
|
# --- Orchestration ---
|
||||||
|
|
||||||
|
def fetch(self) -> Result:
|
||||||
|
"""Execute the fetch: build params → request → check blocked → parse."""
|
||||||
|
params = self._build_params()
|
||||||
|
extra_headers = self._build_headers()
|
||||||
|
http_code, raw = self.http_client.get(
|
||||||
|
self.ENDPOINT, params=params, headers=extra_headers or None
|
||||||
|
)
|
||||||
|
raw_str = str(raw) if not isinstance(raw, str) else raw
|
||||||
|
if self._check_blocked(http_code, raw_str):
|
||||||
|
return Result(success=False, status_code=http_code, error="blocked")
|
||||||
|
return self._parse(http_code, raw)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Step 4: BaseSearcher — 4 template methods (per D-06)**
|
||||||
|
|
||||||
|
```python
|
||||||
|
class BaseSearcher:
|
||||||
|
"""Template-method base class for paginated list searchers.
|
||||||
|
|
||||||
|
Required overrides: _build_params(), _parse()
|
||||||
|
Optional overrides: _build_headers(), _check_blocked()
|
||||||
|
"""
|
||||||
|
ENDPOINT: str = ""
|
||||||
|
|
||||||
|
def __init__(self, page_size: int = 15, http_client: Optional[HTTPClient] = None) -> None:
|
||||||
|
self.page_size = page_size
|
||||||
|
self.http_client = http_client
|
||||||
|
|
||||||
|
# --- Required template methods ---
|
||||||
|
|
||||||
|
def _build_params(self, page_index: int) -> dict:
|
||||||
|
"""Build pagination query params. MUST be overridden."""
|
||||||
|
raise NotImplementedError(f"{type(self).__name__} must implement _build_params()")
|
||||||
|
|
||||||
|
def _parse(self, http_code: int, raw: Any) -> Result:
|
||||||
|
"""Parse the HTTP response into a Result. MUST be overridden."""
|
||||||
|
raise NotImplementedError(f"{type(self).__name__} must implement _parse()")
|
||||||
|
|
||||||
|
# --- Optional template methods ---
|
||||||
|
|
||||||
|
def _build_headers(self) -> dict:
|
||||||
|
"""Build extra request headers. Override for platform-specific headers.
|
||||||
|
|
||||||
|
Default: returns {} (no extra headers beyond HTTPClient defaults).
|
||||||
|
"""
|
||||||
|
return {}
|
||||||
|
|
||||||
|
def _check_blocked(self, status_code: int, body: str) -> bool:
|
||||||
|
"""Detect platform-specific anti-crawl blocks.
|
||||||
|
|
||||||
|
Override to inspect response body/status for block signals.
|
||||||
|
Default: returns False (assume not blocked).
|
||||||
|
"""
|
||||||
|
return False
|
||||||
|
|
||||||
|
# --- Orchestration ---
|
||||||
|
|
||||||
|
def _request(self, params: dict) -> tuple[int, Any]:
|
||||||
|
"""Execute a single HTTP request. Uses _build_headers() for extra headers."""
|
||||||
|
extra_headers = self._build_headers()
|
||||||
|
return self.http_client.get(
|
||||||
|
self.ENDPOINT, params=params, headers=extra_headers or None
|
||||||
|
)
|
||||||
|
|
||||||
|
def search(self, page_index: int = 1) -> Result:
|
||||||
|
"""Fetch a single page: build params → request → check blocked → parse."""
|
||||||
|
params = self._build_params(page_index)
|
||||||
|
http_code, raw = self._request(params)
|
||||||
|
raw_str = str(raw) if not isinstance(raw, str) else raw
|
||||||
|
if self._check_blocked(http_code, raw_str):
|
||||||
|
return Result(success=False, status_code=http_code, error="blocked")
|
||||||
|
return self._parse(http_code, raw)
|
||||||
|
|
||||||
|
def load_all(self, max_pages: int = 10, on_page=None) -> list:
|
||||||
|
"""Iterate pages until is_end_page=True or max_pages reached."""
|
||||||
|
all_items: list = []
|
||||||
|
for page_index in range(1, max_pages + 1):
|
||||||
|
result = self.search(page_index)
|
||||||
|
if not result.success:
|
||||||
|
_logger.warning("第 %d 页失败: %s", page_index, result.error)
|
||||||
|
break
|
||||||
|
all_items.extend(result.list)
|
||||||
|
if on_page:
|
||||||
|
on_page(page_index, result)
|
||||||
|
if result.is_end_page:
|
||||||
|
break
|
||||||
|
return all_items
|
||||||
|
```
|
||||||
|
|
||||||
**crawler_core/__init__.py:**
|
**crawler_core/__init__.py:**
|
||||||
|
|
||||||
@ -406,14 +537,14 @@ Then in `load_all`, replace `print(...)` with `_logger.warning(...)`.
|
|||||||
crawler_core — 招聘爬虫共享核心包
|
crawler_core — 招聘爬虫共享核心包
|
||||||
|
|
||||||
安装方式: pip install -e ./crawler_core
|
安装方式: pip install -e ./crawler_core
|
||||||
使用方式: from crawler_core import BaseFetcher, BaseSearcher, ApiResult, HTTPClient
|
使用方式: from crawler_core import BaseFetcher, BaseSearcher, Result, HTTPClient
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from crawler_core.base import ApiResult, BaseFetcher, BaseSearcher, parse_response
|
from crawler_core.base import Result, BaseFetcher, BaseSearcher, parse_response
|
||||||
from crawler_core.http_client import HTTPClient
|
from crawler_core.http_client import HTTPClient
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
"ApiResult",
|
"Result",
|
||||||
"BaseFetcher",
|
"BaseFetcher",
|
||||||
"BaseSearcher",
|
"BaseSearcher",
|
||||||
"HTTPClient",
|
"HTTPClient",
|
||||||
@ -424,7 +555,7 @@ __version__ = "0.1.0"
|
|||||||
```
|
```
|
||||||
|
|
||||||
**Do NOT:**
|
**Do NOT:**
|
||||||
- Change the logic of `BaseFetcher.fetch()`, `BaseSearcher.search()`, or `BaseSearcher.load_all()` beyond the logger swap
|
- Keep the old `ApiResult` name anywhere in crawler_core (it's fully replaced by `Result[T]`)
|
||||||
- Import from `spiderJobs.*` or `app.*`
|
- Import from `spiderJobs.*` or `app.*`
|
||||||
- Import loguru
|
- Import loguru
|
||||||
- Add any platform-specific code to base.py or __init__.py
|
- Add any platform-specific code to base.py or __init__.py
|
||||||
@ -433,26 +564,40 @@ __version__ = "0.1.0"
|
|||||||
<automated>cd /Users/win/2025/AICoding/JobData && python -c "
|
<automated>cd /Users/win/2025/AICoding/JobData && python -c "
|
||||||
import sys
|
import sys
|
||||||
sys.path.insert(0, '.')
|
sys.path.insert(0, '.')
|
||||||
from crawler_core import BaseFetcher, BaseSearcher, ApiResult, HTTPClient, parse_response
|
from crawler_core import BaseFetcher, BaseSearcher, Result, HTTPClient, parse_response
|
||||||
import dataclasses
|
import dataclasses, typing
|
||||||
fields = {f.name for f in dataclasses.fields(ApiResult)}
|
fields = {f.name for f in dataclasses.fields(Result)}
|
||||||
assert fields == {'success','status_code','data','list','count','is_end_page','error'}, f'ApiResult fields wrong: {fields}'
|
assert fields == {'success','status_code','data','list','count','is_end_page','error'}, f'Result fields wrong: {fields}'
|
||||||
assert hasattr(BaseFetcher, 'fetch'), 'BaseFetcher.fetch missing'
|
assert hasattr(BaseFetcher, 'fetch'), 'BaseFetcher.fetch missing'
|
||||||
|
assert hasattr(BaseFetcher, '_build_headers'), 'BaseFetcher._build_headers missing'
|
||||||
|
assert hasattr(BaseFetcher, '_check_blocked'), 'BaseFetcher._check_blocked missing'
|
||||||
|
assert BaseFetcher._build_headers(object()) == {}, '_build_headers default must return {}'
|
||||||
|
assert BaseFetcher._check_blocked(object(), 200, '') == False, '_check_blocked default must return False'
|
||||||
assert hasattr(BaseSearcher, 'load_all'), 'BaseSearcher.load_all missing'
|
assert hasattr(BaseSearcher, 'load_all'), 'BaseSearcher.load_all missing'
|
||||||
print('All imports OK, ApiResult fields OK')
|
assert hasattr(BaseSearcher, '_build_headers'), 'BaseSearcher._build_headers missing'
|
||||||
|
assert hasattr(BaseSearcher, '_check_blocked'), 'BaseSearcher._check_blocked missing'
|
||||||
|
print('All imports OK, Result fields OK, 4 template methods verified')
|
||||||
"</automated>
|
"</automated>
|
||||||
</verify>
|
</verify>
|
||||||
<acceptance_criteria>
|
<acceptance_criteria>
|
||||||
- `from crawler_core import BaseFetcher, BaseSearcher, ApiResult, HTTPClient` succeeds (with repo root on sys.path)
|
- `from crawler_core import BaseFetcher, BaseSearcher, Result, HTTPClient` succeeds (with repo root on sys.path)
|
||||||
|
- `crawler_core/base.py` defines `Result` as a generic dataclass using `TypeVar` and `Generic[T]`
|
||||||
|
- `crawler_core/base.py` does NOT contain `ApiResult` anywhere: `grep "ApiResult" crawler_core/base.py` returns empty
|
||||||
- `crawler_core/base.py` does NOT contain `from spiderJobs` anywhere: `grep "from spiderJobs" crawler_core/base.py` returns empty
|
- `crawler_core/base.py` does NOT contain `from spiderJobs` anywhere: `grep "from spiderJobs" crawler_core/base.py` returns empty
|
||||||
- `crawler_core/base.py` does NOT contain `print(` anywhere: `grep "print(" crawler_core/base.py` returns empty
|
- `crawler_core/base.py` does NOT contain `print(` anywhere: `grep "print(" crawler_core/base.py` returns empty
|
||||||
- `crawler_core/__init__.py` contains `__all__` with all 5 exports
|
- `BaseFetcher._build_headers(self)` exists and returns `{}` by default
|
||||||
|
- `BaseFetcher._check_blocked(self, status_code, body)` exists and returns `False` by default
|
||||||
|
- `BaseFetcher.fetch()` calls `_build_headers()` and `_check_blocked()` in its implementation
|
||||||
|
- `BaseSearcher._build_headers(self)` exists and returns `{}` by default
|
||||||
|
- `BaseSearcher._check_blocked(self, status_code, body)` exists and returns `False` by default
|
||||||
|
- `BaseSearcher.search()` calls `_check_blocked()` in its implementation
|
||||||
|
- `crawler_core/__init__.py` exports `Result` (not `ApiResult`) in `__all__`
|
||||||
- `crawler_core/__init__.py` contains `__version__ = "0.1.0"`
|
- `crawler_core/__init__.py` contains `__version__ = "0.1.0"`
|
||||||
- `ApiResult` dataclass has exactly 7 fields: success, status_code, data, list, count, is_end_page, error
|
- `Result` dataclass has exactly 7 fields: success, status_code, data, list, count, is_end_page, error
|
||||||
- `BaseFetcher._build_params` raises `NotImplementedError`
|
- `BaseFetcher._build_params` raises `NotImplementedError`
|
||||||
- `BaseSearcher._build_params` raises `NotImplementedError`
|
- `BaseSearcher._build_params` raises `NotImplementedError`
|
||||||
</acceptance_criteria>
|
</acceptance_criteria>
|
||||||
<done>base.py ported (no spiderJobs imports, no print statements), __init__.py exposes clean public API.</done>
|
<done>base.py uses Result[T] generic (no ApiResult), 4 template methods wired into fetch()/search(), __init__.py exports clean public API.</done>
|
||||||
</task>
|
</task>
|
||||||
|
|
||||||
</tasks>
|
</tasks>
|
||||||
@ -465,10 +610,10 @@ cd /Users/win/2025/AICoding/JobData
|
|||||||
python -c "
|
python -c "
|
||||||
import sys
|
import sys
|
||||||
sys.path.insert(0, '.')
|
sys.path.insert(0, '.')
|
||||||
from crawler_core import BaseFetcher, BaseSearcher, ApiResult, HTTPClient, parse_response
|
from crawler_core import BaseFetcher, BaseSearcher, Result, HTTPClient, parse_response
|
||||||
|
|
||||||
# Verify ApiResult structure
|
# Verify Result structure
|
||||||
r = ApiResult(success=True, status_code=200)
|
r = Result(success=True, status_code=200)
|
||||||
assert r.success and r.list == [] and r.error is None
|
assert r.success and r.list == [] and r.error is None
|
||||||
|
|
||||||
# Verify BaseFetcher requires _build_params
|
# Verify BaseFetcher requires _build_params
|
||||||
@ -476,6 +621,13 @@ class TestFetcher(BaseFetcher):
|
|||||||
ENDPOINT = '/test'
|
ENDPOINT = '/test'
|
||||||
def _build_params(self):
|
def _build_params(self):
|
||||||
return {'q': 'test'}
|
return {'q': 'test'}
|
||||||
|
def _parse(self, http_code, raw):
|
||||||
|
return Result(success=True, status_code=http_code)
|
||||||
|
|
||||||
|
# Verify default template method overrides
|
||||||
|
tf = TestFetcher(http_client=None)
|
||||||
|
assert tf._build_headers() == {}, '_build_headers default failed'
|
||||||
|
assert tf._check_blocked(200, '') == False, '_check_blocked default failed'
|
||||||
|
|
||||||
# Verify parse_response with dict input
|
# Verify parse_response with dict input
|
||||||
result = parse_response(200, {'statusCode': 200, 'data': {'list': [{'id': 1}], 'count': 1, 'isEndPage': False}})
|
result = parse_response(200, {'statusCode': 200, 'data': {'list': [{'id': 1}], 'count': 1, 'isEndPage': False}})
|
||||||
@ -492,11 +644,12 @@ Also confirm no cross-contamination:
|
|||||||
grep -r "from spiderJobs" /Users/win/2025/AICoding/JobData/crawler_core/ && echo "FAIL: found spiderJobs import" || echo "OK: no spiderJobs imports"
|
grep -r "from spiderJobs" /Users/win/2025/AICoding/JobData/crawler_core/ && echo "FAIL: found spiderJobs import" || echo "OK: no spiderJobs imports"
|
||||||
grep -r "from app" /Users/win/2025/AICoding/JobData/crawler_core/ && echo "FAIL: found app import" || echo "OK: no app imports"
|
grep -r "from app" /Users/win/2025/AICoding/JobData/crawler_core/ && echo "FAIL: found app import" || echo "OK: no app imports"
|
||||||
grep -r "loguru" /Users/win/2025/AICoding/JobData/crawler_core/ && echo "FAIL: found loguru" || echo "OK: no loguru"
|
grep -r "loguru" /Users/win/2025/AICoding/JobData/crawler_core/ && echo "FAIL: found loguru" || echo "OK: no loguru"
|
||||||
|
grep -r "ApiResult" /Users/win/2025/AICoding/JobData/crawler_core/ && echo "FAIL: ApiResult still present" || echo "OK: ApiResult fully replaced by Result[T]"
|
||||||
```
|
```
|
||||||
</verification>
|
</verification>
|
||||||
|
|
||||||
<success_criteria>
|
<success_criteria>
|
||||||
1. `python -c "from crawler_core import BaseFetcher, BaseSearcher, ApiResult, HTTPClient"` exits 0 (with repo root on sys.path)
|
1. `python -c "from crawler_core import BaseFetcher, BaseSearcher, Result, HTTPClient"` exits 0 (with repo root on sys.path)
|
||||||
2. `crawler_core/pyproject.toml` passes `python -c "import tomllib; tomllib.load(open('crawler_core/pyproject.toml','rb'))"`
|
2. `crawler_core/pyproject.toml` passes `python -c "import tomllib; tomllib.load(open('crawler_core/pyproject.toml','rb'))"`
|
||||||
3. `grep "requests_go" Pipfile` has output — dependency declared
|
3. `grep "requests_go" Pipfile` has output — dependency declared
|
||||||
4. `grep "tenacity" Pipfile` has output — dependency declared
|
4. `grep "tenacity" Pipfile` has output — dependency declared
|
||||||
@ -504,13 +657,15 @@ grep -r "loguru" /Users/win/2025/AICoding/JobData/crawler_core/ && echo "FAIL: f
|
|||||||
6. `grep -r "from spiderJobs" crawler_core/` has NO output
|
6. `grep -r "from spiderJobs" crawler_core/` has NO output
|
||||||
7. `grep -r "loguru" crawler_core/` has NO output
|
7. `grep -r "loguru" crawler_core/` has NO output
|
||||||
8. `grep "min=10" crawler_core/http_client.py` has output — anti-detection delay preserved
|
8. `grep "min=10" crawler_core/http_client.py` has output — anti-detection delay preserved
|
||||||
9. `spiderJobs/` and `jobs_spider/` directories are UNCHANGED (no files modified)
|
9. `grep -r "ApiResult" crawler_core/` has NO output — fully replaced by Result[T]
|
||||||
|
10. `BaseFetcher._build_headers` and `BaseFetcher._check_blocked` exist and are wired into `fetch()`
|
||||||
|
11. `spiderJobs/` and `jobs_spider/` directories are UNCHANGED (no files modified)
|
||||||
</success_criteria>
|
</success_criteria>
|
||||||
|
|
||||||
<output>
|
<output>
|
||||||
After completion, create `.planning/phases/01-shared-core/01-01-SUMMARY.md` with:
|
After completion, create `.planning/phases/01-shared-core/01-01-SUMMARY.md` with:
|
||||||
- What was created (file list with line counts)
|
- What was created (file list with line counts)
|
||||||
- Key decisions made (pyproject.toml structure, tenacity config values, logging approach)
|
- Key decisions made (pyproject.toml structure, tenacity config values, logging approach)
|
||||||
- Interface contracts (the public exports from crawler_core/__init__.py)
|
- Interface contracts (the public exports from crawler_core/__init__.py, Result[T] field list, 4 template method signatures)
|
||||||
- Any deviations from this plan and why
|
- Any deviations from this plan and why
|
||||||
</output>
|
</output>
|
||||||
|
|||||||
@ -125,7 +125,7 @@ class ZhilianSign:
|
|||||||
|
|
||||||
<tasks>
|
<tasks>
|
||||||
|
|
||||||
<task type="auto" tdd="true">
|
<task type="auto">
|
||||||
<name>Task 1: Port sign algorithms to crawler_core/ platform directories</name>
|
<name>Task 1: Port sign algorithms to crawler_core/ platform directories</name>
|
||||||
<read_first>
|
<read_first>
|
||||||
- /Users/win/2025/AICoding/JobData/spiderJobs/platforms/boss/sign.py (source — read every line before writing)
|
- /Users/win/2025/AICoding/JobData/spiderJobs/platforms/boss/sign.py (source — read every line before writing)
|
||||||
@ -138,20 +138,6 @@ class ZhilianSign:
|
|||||||
crawler_core/qcwy/sign.py
|
crawler_core/qcwy/sign.py
|
||||||
crawler_core/zhilian/sign.py
|
crawler_core/zhilian/sign.py
|
||||||
</files>
|
</files>
|
||||||
<behavior>
|
|
||||||
- BossSign.generate_traceid("M-W") returns a 25-char string starting with "M-W"
|
|
||||||
- BossSign.generate_traceid("M-W") result matches regex r'^M-W[0-9a-f]{13}[0-9a-zA-Z]{6}[0-9a-zA-Z]{3}$'
|
|
||||||
- _compute_checksum produces exactly 3 characters from the _CHARS set
|
|
||||||
- _generate_uuid produces exactly 19 characters (13 hex + 6 base62)
|
|
||||||
- Job51Sign().build_sign_path("open/test", "GET") returns tuple of length 2
|
|
||||||
- Job51Sign().build_sign_path("open/test", "GET")[0] starts with "/open/test?api_key=51job&timestamp="
|
|
||||||
- Job51Sign().build_sign_path("open/test", "GET")[1] is 64-char hex string (HMAC-SHA256)
|
|
||||||
- Job51Sign().build_sign_path("open/test", "POST", body={"k": "v"})[1] != Job51Sign().build_sign_path("open/test", "GET")[1] — method affects signature
|
|
||||||
- ZhilianSign().sign_headers() returns dict with exactly 9 keys
|
|
||||||
- ZhilianSign().sign_headers()["x-zp-business-system"] == "73"
|
|
||||||
- ZhilianSign().sign_params() returns dict with exactly 6 keys: at, rt, channel, platform, version, d
|
|
||||||
- ZhilianSign(at="tok123").sign_params()["at"] == "tok123"
|
|
||||||
</behavior>
|
|
||||||
<action>
|
<action>
|
||||||
Copy the three sign algorithm files to their new locations under crawler_core/, making only one change per file: update the module docstring to reference crawler_core.
|
Copy the three sign algorithm files to their new locations under crawler_core/, making only one change per file: update the module docstring to reference crawler_core.
|
||||||
|
|
||||||
@ -238,7 +224,7 @@ print('All sign algorithms imported and validated')
|
|||||||
<done>Three sign.py files in crawler_core/ — pure functions, no HTTP, no cross-imports from app or spiderJobs.</done>
|
<done>Three sign.py files in crawler_core/ — pure functions, no HTTP, no cross-imports from app or spiderJobs.</done>
|
||||||
</task>
|
</task>
|
||||||
|
|
||||||
<task type="auto" tdd="true">
|
<task type="auto">
|
||||||
<name>Task 2: Write sign algorithm unit tests</name>
|
<name>Task 2: Write sign algorithm unit tests</name>
|
||||||
<read_first>
|
<read_first>
|
||||||
- /Users/win/2025/AICoding/JobData/crawler_core/boss/sign.py (just created — read to understand exact exports)
|
- /Users/win/2025/AICoding/JobData/crawler_core/boss/sign.py (just created — read to understand exact exports)
|
||||||
@ -253,35 +239,6 @@ print('All sign algorithms imported and validated')
|
|||||||
tests/crawler_core/test_qcwy_sign.py
|
tests/crawler_core/test_qcwy_sign.py
|
||||||
tests/crawler_core/test_zhilian_sign.py
|
tests/crawler_core/test_zhilian_sign.py
|
||||||
</files>
|
</files>
|
||||||
<behavior>
|
|
||||||
Boss sign tests:
|
|
||||||
- test_traceid_format: generate_traceid() matches regex r'^M-W[0-9a-f]{13}[0-9a-zA-Z]{6}[0-9a-zA-Z]{3}$'
|
|
||||||
- test_traceid_length: generate_traceid() is exactly 25 chars
|
|
||||||
- test_traceid_custom_prefix: generate_traceid("X-Y") starts with "X-Y"
|
|
||||||
- test_traceid_uniqueness: two calls return different values
|
|
||||||
- test_compute_checksum_length: _compute_checksum(any 19-char string) returns 3 chars
|
|
||||||
- test_compute_checksum_chars: all 3 chars are in _CHARS (base62)
|
|
||||||
- test_compute_checksum_deterministic: same input → same output
|
|
||||||
- test_generate_uuid_length: _generate_uuid() returns 19 chars
|
|
||||||
|
|
||||||
Job51 sign tests:
|
|
||||||
- test_build_sign_path_get_format: GET path starts with "/{endpoint}?api_key=51job&timestamp="
|
|
||||||
- test_build_sign_path_returns_tuple: returns tuple of (str, str)
|
|
||||||
- test_sign_hex_length: sign is 64-char hex string matching r'^[0-9a-f]{64}$'
|
|
||||||
- test_get_vs_post_different_sign: same endpoint, different method → different sign
|
|
||||||
- test_sign_with_params_includes_params_in_path: GET with params={'k':'v'} → path contains "k=v"
|
|
||||||
- test_sign_key_in_path: path contains "api_key=51job"
|
|
||||||
- test_generate_uuid_length: generate_uuid() returns string of length 23 (13+10)
|
|
||||||
|
|
||||||
Zhilian sign tests:
|
|
||||||
- test_sign_headers_keys: returns dict with exactly these 9 keys: x-zp-at, x-zp-rt, x-zp-action-id, x-zp-page-code, x-zp-version, x-zp-channel, x-zp-platform, x-zp-device-id, x-zp-business-system
|
|
||||||
- test_sign_headers_business_system: x-zp-business-system == "73"
|
|
||||||
- test_sign_headers_tokens: x-zp-at and x-zp-rt reflect constructor args
|
|
||||||
- test_sign_params_keys: returns dict with exactly these 6 keys: at, rt, channel, platform, version, d
|
|
||||||
- test_sign_params_device_id_matches: d == device_id from constructor
|
|
||||||
- test_generate_uuid_format: matches UUID4 pattern r'^[0-9A-F]{8}-[0-9A-F]{4}-4[0-9A-F]{3}-[89AB][0-9A-F]{3}-[0-9A-F]{12}$'
|
|
||||||
- test_action_id_unique_per_call: two sign_headers() calls produce different x-zp-action-id values
|
|
||||||
</behavior>
|
|
||||||
<action>
|
<action>
|
||||||
Create `tests/crawler_core/__init__.py` (empty file) and three test files.
|
Create `tests/crawler_core/__init__.py` (empty file) and three test files.
|
||||||
|
|
||||||
|
|||||||
@ -16,7 +16,7 @@
|
|||||||
### 包结构和安装方式
|
### 包结构和安装方式
|
||||||
- **D-01:** 包放在项目根目录 `crawler_core/`,与 `app/` 和 `spiderJobs/` 平级
|
- **D-01:** 包放在项目根目录 `crawler_core/`,与 `app/` 和 `spiderJobs/` 平级
|
||||||
- **D-02:** 使用 `pyproject.toml` 管理包元数据,支持 `pip install -e ./crawler_core`
|
- **D-02:** 使用 `pyproject.toml` 管理包元数据,支持 `pip install -e ./crawler_core`
|
||||||
- **D-03:** 最小依赖范围 — 只依赖 `requests_go` + Python 标准库,不拉入 FastAPI/Tortoise/loguru
|
- **D-03:** 最小依赖范围 — 只依赖 `requests_go` + `tenacity` + Python 标准库,不拉入 FastAPI/Tortoise/loguru
|
||||||
- **D-04:** 包名为 `crawler_core`
|
- **D-04:** 包名为 `crawler_core`
|
||||||
|
|
||||||
### 基类接口设计
|
### 基类接口设计
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user