全部完成

This commit is contained in:
win 2026-03-22 23:22:30 +08:00
parent 45fba5697e
commit 78eee99c2f
102 changed files with 46097 additions and 38935 deletions

116
.claude/index.json Normal file
View File

@ -0,0 +1,116 @@
{
"generated_at": "2026-03-20T00:00:00+08:00",
"project": "JobData - 招聘数据采集与统计分析平台",
"scan_coverage": {
"estimated_total_files": 220,
"scanned_files": 35,
"coverage_percent": 16,
"note": "node_modules 已排除覆盖全部核心业务文件node_modules 约占总文件数 90% 以上"
},
"ignored_directories": [
{ "path": "web/node_modules", "reason": ".gitignore 规则" },
{ "path": "__pycache__", "reason": ".gitignore 规则" },
{ "path": ".venv / venv", "reason": ".gitignore 规则" },
{ "path": "migrations", "reason": ".gitignore 规则" },
{ "path": "clickhouse_data / data", "reason": ".gitignore 规则" }
],
"modules": [
{
"path": "app",
"language": "Python 3.13",
"framework": "FastAPI 0.111 + Tortoise-ORM 0.23",
"entry": "app/__init__.py",
"startup_script": "run.py",
"config": "app/settings/config.py",
"api_prefix": "/api/v1",
"routes": [
"/base", "/user", "/role", "/menu", "/api", "/dept", "/auditlog",
"/job", "/universal", "/token", "/proxy", "/stats", "/pipeline",
"/keyword", "/cleaning", "/analytics", "/company"
],
"databases": {
"mysql": "Tortoise-ORM, 表: user/role/api/menu/dept/auditlog/boss_token/cleaning_*/metrics_*",
"clickhouse": "直接连接, 表: boss_job/boss_company/qcwy_job/qcwy_company/zhilian_job/zhilian_company/pending_company, 视图: job_analytics"
},
"scheduler_tasks": [
"stats_job (每6h)", "ecs_full_pipeline (每6h)",
"ip_alert_job (每10min)", "company_cleaning_job (每5min)",
"daily_cleanup_job (每天00:05)"
],
"test_exists": false,
"gaps": ["缺少单元测试", "缺少集成测试", "config.py 中有硬编码凭据"]
},
{
"path": "web",
"language": "JavaScript (Vue 3)",
"framework": "Vue 3.3 + Vite 4 + Naive UI + Pinia",
"entry": "web/src/main.js",
"build_cmd": "pnpm build",
"dev_cmd": "pnpm dev",
"key_views": [
"views/analytics/index.vue",
"views/cleaning/index.vue",
"views/recruitment/boss|qcwy|zhilian/index.vue",
"views/system/*"
],
"api_layer": "web/src/api/",
"state_management": "Pinia (user, permission, app, tags)",
"test_exists": false,
"gaps": ["缺少 Vitest 单元测试", "缺少 Playwright E2E 测试"]
},
{
"path": "jobs_spider",
"language": "Python",
"sub_modules": ["boss", "qcwy", "zhilian"],
"entry": {
"boss": "jobs_spider/boss/boos_api.py (死循环主入口)",
"qcwy": "jobs_spider/qcwy/run_company_search.py",
"zhilian": "jobs_spider/zhilian/company_spider.py"
},
"push_api": "POST /api/v1/universal/data/batch-store-async",
"anti_crawl": ["SmartIPManager (代理池轮换)", "随机延迟>=10s", "Session 重建", "Cookie 更新"],
"test_exists": false,
"gaps": ["缺少 IPAnomalyDetector 单元测试", "缺少数据解析函数测试"]
},
{
"path": "ecs_full_pipeline.py",
"language": "Python",
"description": "阿里云 ECS 实例批量创建/销毁/命令下发全流程脚本",
"entry": "ecs_full_pipeline.py",
"cloud": "Alibaba Cloud ECS (cn-qingdao-b, ecs.n1.tiny, 抢占实例)",
"gaps": ["AK/SK 在代码中硬编码(安全风险)"]
}
],
"security_issues": [
{
"severity": "HIGH",
"file": "app/settings/config.py",
"issue": "MySQL 连接串含密码、ClickHouse 密码、SMTP 密码硬编码"
},
{
"severity": "HIGH",
"file": "ecs_full_pipeline.py",
"issue": "阿里云 AK/SK 硬编码在 main() 函数中"
},
{
"severity": "MEDIUM",
"file": "app/settings/config.py",
"issue": "SECRET_KEY = 'CHANGE_ME_DEV_ONLY',生产环境需替换"
}
],
"coverage_gaps": [
"所有模块均无自动化测试(单元/集成/E2E",
"jobs_spider/qcwy 和 jobs_spider/zhilian 未深度扫描",
"app/services/company_cleaner.py 未扫描",
"web/src/views/system/* 子目录未逐一扫描"
],
"next_scan_recommendations": [
"补扫: app/services/company_cleaner.py",
"补扫: jobs_spider/qcwy/qcwy.py",
"补扫: jobs_spider/zhilian/zhilian_single.py",
"补扫: web/src/views/system/ 各子目录",
"补扫: app/core/algorithms/ 反爬虫算法",
"补扫: app/api/v1/cleaning/ 清洗接口实现"
],
"truncated": false
}

View File

@ -0,0 +1,371 @@
# 爬虫状态管理系统设计
## Context
当前爬虫系统存在两个核心缺陷:
1. **关键词消费不可恢复**:`get_available()` 通过 `last_requested_date=today` 标记关键词已用,一旦爬虫崩溃,该关键词当天不会再被分配,已爬取的页面数据白白浪费。
2. **分页状态无持久化**:所有 3 个爬虫的分页逻辑都在内存中(Boss MAX_PAGES=3, QCWY MAX_PAGES=50, Zhilian MAX_PAGES=15),网络异常或进程重启后无法从断点恢复。
用户需求:关键词用完后标记不再重复请求 + 记录分页进度实现断点续爬。
---
## 架构决策
1. **扩展现有 keyword 表**而非新建表:crawl 状态与 keyword 是 1:1 日粒度关系,新建表增加 JOIN 开销且无收益
2. **状态机驱动**:`crawl_status` 字段控制关键词生命周期,替代简单的 date 比较
3. **服务端记录进度**:爬虫每完成一页向服务端汇报,而非本地记录(支持多机分布式爬取)
4. **过期检测**:`crawling` 状态超时自动降级为 `partial`,防止僵死
---
## 实施步骤
### Step 1: 扩展 Keyword 模型
**修改** `app/models/keyword.py`
`BaseKeyword` 中新增字段:
```python
class BaseKeyword(Model):
id = fields.IntField(pk=True)
city = fields.CharField(max_length=64)
job = fields.CharField(max_length=128)
last_requested_date = fields.DateField(null=True)
last_requested_at = fields.DatetimeField(null=True)
# --- 新增:爬取状态管理 ---
crawl_status = fields.CharField(max_length=16, default="idle")
# 状态值: idle / crawling / completed / failed / partial
last_completed_page = fields.IntField(default=0) # 最后完成的页码
total_pages = fields.IntField(default=0) # 发现的总页数(0=未知)
jobs_found = fields.IntField(default=0) # 累计发现的职位数
crawl_started_at = fields.DatetimeField(null=True) # 当次爬取开始时间
crawler_id = fields.CharField(max_length=64, default="") # 爬虫实例标识
error_message = fields.TextField(default="") # 最后错误信息
retry_count = fields.IntField(default=0) # 当天重试次数
created_at = fields.DatetimeField(auto_now_add=True)
updated_at = fields.DatetimeField(auto_now=True)
class Meta:
abstract = True
```
状态机流转:
```
idle ──(get_available)──► crawling ──(all pages done)──► completed
                              ├──(spider reports error)──► failed ──(retry<3)──► crawling
                              └──(timeout 30min)──► partial ──(get_available)──► crawling
次日 00:00:所有状态 → idle(通过 last_requested_date != today 自动重置)
```
### Step 2: 重写 get_available() 控制器
**修改** `app/controllers/keyword.py` 的 `get_available()` 方法
优先级调度逻辑(替代当前的简单 date 过滤):
```
优先级 1: crawl_status='partial' AND last_requested_date=today (断点续爬)
优先级 2: crawl_status='failed' AND retry_count<3 AND last_requested_date=today (失败重试)
优先级 3: (last_requested_date!=today OR last_requested_date IS NULL) (全新关键词)
```
返回值增加 `last_completed_page` 和 `crawl_status`,使爬虫知道从哪页开始:
```python
items = [{
"id": r.id,
"city": r.city,
"job": r.job,
"last_completed_page": r.last_completed_page, # 新增
"crawl_status": r.crawl_status, # 新增
}]
```
认领时原子更新:
```python
update_fields = {
"last_requested_date": today,
"last_requested_at": now,
"crawl_status": "crawling",
"crawl_started_at": now,
"crawler_id": crawler_id, # 从请求参数获取
}
# 如果是全新关键词(非续爬),重置分页状态
if is_fresh:
update_fields["last_completed_page"] = 0
update_fields["total_pages"] = 0
update_fields["jobs_found"] = 0
update_fields["error_message"] = ""
update_fields["retry_count"] = 0
```
### Step 3: 新增进度汇报 API
**修改** `app/api/v1/keyword/keyword.py` — 新增 2 个端点
#### 3.1 页面进度汇报
```
POST /api/v1/keyword/page-progress
Body: {
"source": "boss",
"keyword_id": 123,
"page": 2,
"total_pages": 10, // 可选,爬虫发现的总页数
"jobs_found": 15 // 本页发现的职位数
}
```
控制器逻辑:
```python
async def report_page_progress(self, source, keyword_id, page, total_pages=None, jobs_found=0):
    model = self._ensure_model(source)
    update_data = {
        "last_completed_page": page,
        # jobs_found 累加(F 表达式在数据库侧原子执行)
        "jobs_found": F("jobs_found") + jobs_found,
    }
    # 仅当爬虫上报了有效总页数时才覆盖 total_pages
    if total_pages is not None and total_pages > 0:
        update_data["total_pages"] = total_pages
    await model.filter(id=keyword_id).update(**update_data)
```
#### 3.2 爬取完成/失败汇报
```
POST /api/v1/keyword/crawl-complete
Body: {
"source": "boss",
"keyword_id": 123,
"status": "completed" | "failed",
"error_message": "optional error detail"
}
```
控制器逻辑:
```python
async def report_crawl_complete(self, source, keyword_id, status, error_message=""):
    model = self._ensure_model(source)
    update_data = {"crawl_status": status, "error_message": error_message}
    if status == "failed":
        # 使用 F 表达式原子递增 retry_count,避免"先读后写"在并发汇报时丢失计数
        update_data["retry_count"] = F("retry_count") + 1
    await model.filter(id=keyword_id).update(**update_data)
```
### Step 4: 新增请求 Schema
**修改** `app/schemas/keyword.py`(或新建)
```python
class PageProgressRequest(BaseModel):
source: str
keyword_id: int
page: int
total_pages: Optional[int] = None
jobs_found: int = 0
class CrawlCompleteRequest(BaseModel):
source: str
keyword_id: int
status: Literal["completed", "failed"]
error_message: str = ""
```
### Step 5: 过期爬取检测(定时任务)
**修改** `app/core/scheduler.py` — 新增 `stale_crawl_cleanup_job`
```python
# 每 10 分钟检查一次
async def stale_crawl_cleanup():
"""将超过 30 分钟仍为 crawling 状态的关键词降级为 partial"""
threshold = datetime.now() - timedelta(minutes=30)
for model in [BossKeyword, QcwyKeyword, ZhilianKeyword]:
count = await model.filter(
crawl_status="crawling",
crawl_started_at__lt=threshold,
).update(crawl_status="partial")
if count:
logger.info(f"{model.__name__}: {count} 条僵死爬取任务已标记为 partial")
```
### Step 6: 修改爬虫 — Boss 直聘
**修改** `jobs_spider/boss/boos_api.py`
#### 6.1 增强 fetch_service_params()
```python
def fetch_service_params() -> Optional[Dict[str, Any]]:
try:
url = f"{API_BASE_URL}/api/v1/keyword/available"
crawler_id = f"boss-{os.getpid()}-{os.getenv('HOSTNAME', 'local')}"
r = requests.get(url, params={
"source": "boss", "limit": 1, "reserve": True,
"crawler_id": crawler_id, # 新增
}, timeout=10)
# ... 解析逻辑 ...
item = items[0]
# 不再需要 mark-used(get_available 已原子标记)
return {
"query": item["job"],
"city": item["city"],
"scene": 1,
"page": item.get("last_completed_page", 0) + 1, # 断点续爬
"keyword_id": item["id"], # 新增
}
except Exception:
return None
```
#### 6.2 主循环添加进度汇报
```python
# 在 get_job_list_multi_pages 的每页完成后回调中:
def on_page_complete(page_num, jobs_count, keyword_id):
try:
requests.post(f"{API_BASE_URL}/api/v1/keyword/page-progress", json={
"source": "boss",
"keyword_id": keyword_id,
"page": page_num,
"jobs_found": jobs_count,
}, timeout=5)
except Exception:
pass # 汇报失败不影响主流程
# 全部完成后:
def on_crawl_done(keyword_id, success, error_msg=""):
try:
requests.post(f"{API_BASE_URL}/api/v1/keyword/crawl-complete", json={
"source": "boss",
"keyword_id": keyword_id,
"status": "completed" if success else "failed",
"error_message": error_msg,
}, timeout=5)
except Exception:
pass
```
### Step 7: 修改爬虫 — 前程无忧
**修改** `jobs_spider/qcwy/qcwy.py`
同 Boss 结构:
- `fetch_service_params()` 增加 `keyword_id` 和 `last_completed_page` 返回
- `crawl_multiple_pages()` 的 `start_page` 从 `last_completed_page + 1` 开始
- 每页完成后调用 `page-progress` API
- 全部完成/失败后调用 `crawl-complete` API
### Step 8: 修改爬虫 — 智联招聘
**修改** `jobs_spider/zhilian/zhilian_single.py`
同 Boss 结构:
- `fetch_service_params()` 增加 `keyword_id` 和 `last_completed_page` 返回
- `crawl_pc()` 的起始页从 `last_completed_page + 1` 开始
- 每页完成后调用 `page-progress` API
- 全部完成/失败后调用 `crawl-complete` API
### Step 9: 统计接口增强
**修改** `app/controllers/keyword.py` 的 `get_stats()`
返回值增加爬取状态分布:
```python
async def get_stats(self, source, on_date=None):
# ... 现有逻辑 ...
# 新增状态分布
crawling = await model.filter(crawl_status="crawling", last_requested_date=d).count()
completed = await model.filter(crawl_status="completed", last_requested_date=d).count()
failed = await model.filter(crawl_status="failed", last_requested_date=d).count()
partial = await model.filter(crawl_status="partial", last_requested_date=d).count()
return {
"data": {
"date": str(d), "total": total, "used": used, "unused": unused,
"crawl_status": {
"crawling": crawling,
"completed": completed,
"failed": failed,
"partial": partial,
}
}
}
```
### Step 10: 数据库迁移
执行 Aerich 迁移以在 MySQL keyword 表中添加新字段:
```bash
aerich migrate --name add_crawl_state_fields
aerich upgrade
```
或在 `init_app.py` 的自动迁移中由 Aerich 自动处理(`RUN_MIGRATIONS_ON_STARTUP=True`)。
---
## 关键文件清单
| 文件 | 操作 | 说明 |
|------|------|------|
| `app/models/keyword.py` | 修改 | 添加 8 个爬取状态字段 |
| `app/controllers/keyword.py` | 修改 | 重写 get_available() 优先级调度 + 新增 2 个方法 |
| `app/api/v1/keyword/keyword.py` | 修改 | 新增 page-progress / crawl-complete 端点 |
| `app/schemas/keyword.py` | 新建 | PageProgressRequest / CrawlCompleteRequest |
| `app/core/scheduler.py` | 修改 | 新增 stale_crawl_cleanup 定时任务 |
| `jobs_spider/boss/boos_api.py` | 修改 | 断点续爬 + 进度汇报 |
| `jobs_spider/qcwy/qcwy.py` | 修改 | 断点续爬 + 进度汇报 |
| `jobs_spider/zhilian/zhilian_single.py` | 修改 | 断点续爬 + 进度汇报 |
---
## 效果
| 指标 | 改造前 | 改造后 |
|------|--------|--------|
| 崩溃恢复 | 关键词丢失,当天不可恢复 | 自动从断点续爬 |
| 页面重复爬取 | 100%(整个关键词重爬) | 0%(精确到页级别) |
| 僵死任务检测 | 无 | 30 分钟自动降级 |
| 失败重试 | 无(关键词当天报废) | 最多 3 次自动重试 |
| 爬取进度可见性 | 无 | 实时可查(stats API) |
## 风险与缓解
| 风险 | 缓解措施 |
|------|----------|
| 字段迁移影响现有数据 | 所有新字段都有 default 值,迁移无破坏性 |
| 进度汇报增加 API 压力 | 汇报请求轻量(仅 UPDATE 单行),每页仅 1 次,可设 timeout=5s |
| 爬虫不升级导致状态不一致 | 新旧爬虫可共存:旧爬虫不汇报进度,关键词仍按 date 逻辑工作 |
| retry_count 无上限 | 硬限 3 次,超过 3 次的 failed 不再自动重试 |
## 验证方式
1. 启动应用,确认 Aerich 自动迁移新增字段成功
2. 手动调用 `GET /api/v1/keyword/available?source=boss` 验证返回 `last_completed_page` 和 `crawl_status`
3. 模拟断点:手动设置某关键词 `crawl_status=partial, last_completed_page=2`,再次 `get_available` 应优先返回该关键词
4. 启动 Boss 爬虫,观察日志确认从 `last_completed_page + 1` 开始
5. 强制 kill 爬虫,等待 30 分钟后确认 `stale_crawl_cleanup` 将状态降级为 `partial`
6. 重启爬虫,确认自动续爬

View File

@ -0,0 +1,317 @@
# 📋 实施计划:项目功能修复与优化
> 生成时间:2026-03-20
> 工作目录:/Users/win/2025/AICoding/JobData
---
## 一、问题全景(为什么很多功能用不了)
经过深度代码审查,共发现 **22 个问题**,其中多个问题会直接导致功能完全不可用。以下按功能模块分组。
---
### 🔴 数据清洗功能(6 个问题 → 功能基本不可用)
| # | 严重度 | 文件 | 行号 | 问题 | 影响 |
|---|--------|------|------|------|------|
| C1 | **严重** | `services/cleaning.py` | 13 | `from jobs_spider.qcwy.search_company_jobs import _extract_items` 引入私有函数,若该文件/函数不存在则整个模块 `ImportError`,**所有清洗 API 直接 500** | 全部清洗功能不可用 |
| C2 | **严重** | `services/cleaning.py` | 多处 | 所有爬虫调用(`boss_service.get_job_detail_by_id` 等)是同步 HTTP 阻塞调用,在 `async def` 中直接执行,**阻塞整个事件循环** | 高并发时应用无响应 |
| C3 | **高** | `services/cleaning.py` | 28-36 | Boss Token 加载后 `_boss_token_loaded = True` 永不刷新,Token 过期后 Boss 清洗**静默失败** | Boss 平台清洗失效 |
| C4 | **高** | `api/v1/cleaning/cleaning.py` | 285-308 | `process_task` API 无超时保护,爬虫卡住则 HTTP 连接永久挂起 | 客户端超时 |
| C5 | **高** | **前端缺失** | — | 后端菜单注册了 `/cleaning/index` 和 `/cleaning/monitor`,但 `web/src/views/` 下**不存在对应组件文件** | 点菜单白屏/404 |
| C6 | **中** | `api/v1/cleaning/cleaning.py` | 71-75 | `source`/`status` 直接拼入 ClickHouse SQL,存在 SQL 注入风险 | 安全漏洞 |
---
### 🔴 定时任务功能(6 个问题 → 任务可能永久跳过)
| # | 严重度 | 文件 | 行号 | 问题 | 影响 |
|---|--------|------|------|------|------|
| S1 | **严重** | `core/locks.py` | 43 | 文件锁用 `os.mkdir` 实现,**无 TTL 过期机制**,Worker 崩溃后锁目录永久残留,该任务**永久跳过** | 任务永久失效 |
| S2 | **严重** | `core/locks.py` | 38 | 异步函数中使用同步 `redis.Redis`,**阻塞事件循环** | 全局性能问题 |
| S3 | **高** | `core/init_app.py` | — | 启动锁 `.startup_lock` 同样无 TTL,崩溃后**迁移和种子数据初始化永不再执行** | 启动异常 |
| S4 | **高** | `core/locks.py` | 17 | 锁文件路径为**相对路径** `.lock_xxx`,多 Worker 以不同 CWD 启动时锁完全失效 | 任务并发执行 |
| S5 | **中** | `core/scheduler.py` | — | `stats_job` 与 `ecs_full_pipeline_job` 调度时间完全重合(`*/6h`),同时执行压力大 | 资源竞争 |
| S6 | **中** | `core/scheduler.py` | 181 | `company_cleaning_job` 处理 30 个公司可能超过 5 分钟调度间隔,任务堆积被 skip | 清洗停滞 |
---
### 🔴 安全问题(4 个 → 凭据泄漏)
| # | 严重度 | 文件 | 行号 | 问题 |
|---|--------|------|------|------|
| X1 | **严重** | `ecs_full_pipeline.py` | 487-488 | 阿里云 AK/SK 硬编码在代码中,已在 git 历史里 |
| X2 | **严重** | `settings/config.py` | 44-52 | MySQL root 密码、SMTP 授权码、ClickHouse 密码硬编码 |
| X3 | **严重** | `services/job.py` | 535 | 第三方 API 签名 salt 硬编码 |
| X4 | **严重** | `core/dependency.py` | 26-28 | `token == "dev"` 开发后门在生产环境同样有效 |
---
### 🟡 IP 告警功能(3 个问题)
| # | 严重度 | 文件 | 行号 | 问题 |
|---|--------|------|------|------|
| I1 | **中** | `core/scheduler.py` | 273 | 邮件模板用 `a.get('date')` 但实际字段是 `last_report_at`,告警日期列**永远为空** |
| I2 | **中** | `core/ip_tracking.py` | — | 中间件读 `response.body` 对流式响应无效,IP 计数不准 |
| I3 | **低** | `core/ip_tracking.py` | 73 | `save()` 未指定 `update_fields`,并发写存在竞态 |
---
### 🟡 分析功能(2 个问题)
| # | 严重度 | 文件 | 行号 | 问题 |
|---|--------|------|------|------|
| A1 | **高** | `api/v1/analytics.py` | — | `backports.zoneinfo` 未在 Pipfile 中声明,若 Python 3.8 则 `ImportError`,整个分析路由挂 |
| A2 | **低** | `api/v1/analytics.py` | — | `Query(regex=...)` 在 FastAPI 0.100+(配合 Pydantic v2)中已弃用,应改 `pattern` |
---
### 🟡 Ruff 报告的代码缺陷(之前已诊断,此处不重复)
共 34 个 lint 错误,其中 3 个 F821(未定义变量 `udt`/`fpt`/`json`)会导致运行时崩溃。
---
## 二、实施步骤(按优先级排序)
### Phase 1:修复致命问题(功能完全不可用)
#### 1.1 修复文件锁 — 添加 TTL 过期机制
**文件**`app/core/locks.py`
```python
# 修改 _try_file_lock 方法
# 在 acquire 时写入时间戳到锁目录内的文件
# 在 acquire 失败时检查时间戳,若超过 TTL 则强制删除旧锁
async def acquire(self) -> bool:
# Redis 路径不变
if self._redis:
return bool(self._redis.set(self._key, "locked", nx=True, ex=self.ttl))
# 文件锁路径:改用绝对路径 + TTL 检查
lock_dir = Path(tempfile.gettempdir()) / f"jobdata_lock_{self.name}"
lock_meta = lock_dir / "meta"
try:
lock_dir.mkdir()
lock_meta.write_text(str(time.time()))
return True
except FileExistsError:
# 检查是否过期
if lock_meta.exists():
created = float(lock_meta.read_text())
if time.time() - created > self.ttl:
shutil.rmtree(lock_dir) # 强制清理过期锁
return await self.acquire() # 重试
return False
```
#### 1.2 修复 Redis 同步阻塞 → 异步
**文件**`app/core/locks.py`
将 `redis.Redis` 替换为 `redis.asyncio.Redis`,所有 `self._redis.set/get/delete` 改为 `await self._redis.set/get/delete`。
#### 1.3 修复清洗模块 ImportError 风险
**文件**`app/services/cleaning.py:13`
```python
# 修改前
from jobs_spider.qcwy.search_company_jobs import _extract_items as qcwy_extract_items
# 修改后:安全导入 + 降级
try:
from jobs_spider.qcwy.search_company_jobs import _extract_items as qcwy_extract_items
except ImportError:
logger.warning("qcwy search_company_jobs 模块不可用,公司职位提取功能降级")
qcwy_extract_items = None
```
#### 1.4 修复清洗中同步阻塞调用
**文件**`app/services/cleaning.py` 多处
将所有同步爬虫调用包装为 `asyncio.to_thread`
```python
# 修改前
data = self.boss_service.get_job_detail_by_id(target)
# 修改后
data = await asyncio.to_thread(self.boss_service.get_job_detail_by_id, target)
```
涉及的方法:`clean_by_job_id`、`clean_by_company_name`、`clean_boss_company_jobs`、`clean_qcwy_company_jobs`、`clean_zhilian_company_jobs`(共约 12 处调用)。
`app/services/company_cleaner.py` 中同样的模式也需要修复(同步爬虫调用包装为 `to_thread`)。
#### 1.5 修复 Boss Token 永久缓存问题
**文件**:`app/services/cleaning.py:28-36`、`app/services/company_cleaner.py:28-36`
```python
# 修改前
async def _ensure_boss_token_loaded(self) -> None:
if self._boss_token_loaded and self.boss_service.login_data.get("mpt"):
return # 永不刷新
# 修改后:添加过期时间检查
async def _ensure_boss_token_loaded(self) -> None:
now = time.time()
if (self._boss_token_loaded
and self.boss_service.login_data.get("mpt")
and now - self._token_loaded_at < 3600): # 1小时刷新一次
return
token_obj = await BossToken.filter(is_active=True).order_by("-updated_at").first()
if token_obj:
self.boss_service.login_data["mpt"] = token_obj.mpt_value
self._boss_token_loaded = True
self._token_loaded_at = now
```
---
### Phase 2:修复安全问题
#### 2.1 凭据迁移(config.py)
创建 `.env.example` + 修改 `config.py`,用 `pydantic-settings` 从环境变量读取(详见之前的 ruff-optimization 计划)。
#### 2.2 移除 dev 后门
**文件**`app/core/dependency.py:26-28`
```python
# 修改前
if token == "dev":
user = await User.filter().first()
return user
# 修改后:仅在开发环境允许
import os
if token == "dev" and os.getenv("APP_ENV", "production") == "development":
user = await User.filter().first()
return user
```
#### 2.3 阿里云 AK/SK 移入环境变量
**文件**`ecs_full_pipeline.py:487-488`
```python
# 修改前
ak = "LTAI5tBgW3hAzcnHBkZywxkD"
sk = "Il7M4bkJvdZIutkJH8pxhuMLrMvj5x"
# 修改后
ak = os.environ["ALIBABA_CLOUD_ACCESS_KEY_ID"]
sk = os.environ["ALIBABA_CLOUD_ACCESS_KEY_SECRET"]
```
---
### Phase 3:修复 IP 告警和分析功能
#### 3.1 修复邮件模板字段名
**文件**`app/core/scheduler.py:273`
```python
# 修改前
f"<td>{a.get('date')}</td>"
# 修改后
f"<td>{a.get('last_report_at', 'N/A')}</td>"
```
#### 3.2 修复 analytics Query 参数弃用警告
**文件**`app/api/v1/analytics.py`
```python
# 修改前
interval: str = Query("day", regex="^(day|hour|week|month)$")
# 修改后
interval: str = Query("day", pattern="^(day|hour|week|month)$")
```
#### 3.3 修复 zoneinfo 导入
确认 Python 版本为 3.13(项目 Pipfile 声明),`zoneinfo` 是标准库,无需 `backports`。可直接删除 try/except,只保留 `from zoneinfo import ZoneInfo`。
---
### Phase 4:修复 Ruff 34 个 lint 错误
```bash
# 自动修复 22 个
pipenv run ruff check app/ --fix
# 手动修复剩余 12 个(F821 × 3、E722 × 1、E402 × 5、其他 × 3)
```
F821 重点修复:
- `job.py:348``udt` 未定义(需确认应为 `update_date_time`
- `job.py:374``fpt` 未定义(需确认应为 `first_publish_time`
- `crawler/zhilian.py:60` — 添加 `import json`
---
### Phase 5:代码去重和可维护性优化
1. 合并 `job.py` 中 7 个重复的 `_check_*_duplicate` 为 1 个通用方法
2. 删除死代码 `_check_qcwy_company_duplicate_by_name`
3. 将 `job.py` 中的 `requests.post` 替换为 `httpx.AsyncClient`
4. 错开调度时间:将 `ecs_full_pipeline_job` 偏移 30 分钟,避免与 `stats_job` 重合
---
## 三、关键文件索引
| 文件 | 操作 | Phase | 说明 |
|------|------|-------|------|
| `app/core/locks.py` | 重构 | 1 | 文件锁 TTL + Redis 异步化 |
| `app/services/cleaning.py` | 修复 | 1 | ImportError 防护 + async 阻塞 + Token 刷新 |
| `app/services/company_cleaner.py` | 修复 | 1 | async 阻塞 + Token 刷新 |
| `app/core/dependency.py` | 修复 | 2 | dev 后门加环境判断 |
| `app/settings/config.py` | 重构 | 2 | 凭据移入环境变量 |
| `ecs_full_pipeline.py` | 修复 | 2 | AK/SK 移入环境变量 |
| `app/core/scheduler.py` | 修复 | 3 | 邮件字段名 + 调度时间偏移 |
| `app/api/v1/analytics.py` | 修复 | 3 | regex→pattern + zoneinfo |
| `app/services/job.py` | 修复 | 4+5 | F821 + E722 + requests→httpx + 去重方法合并 |
| `app/services/crawler/zhilian.py` | 修复 | 4 | 添加 import json |
| `web/src/views/cleaning/` | 新建 | 5 | 创建前端清洗页面组件(可选) |
| `.env.example` | 新建 | 2 | 环境变量模板 |
---
## 四、风险与缓解
| 风险 | 缓解措施 |
|------|----------|
| 文件锁改造后旧锁目录残留 | 部署时手动清理 `.lock_*` 目录 |
| Redis 异步化后连接池配置不同 | 保持相同连接参数,仅换客户端类 |
| `asyncio.to_thread` 增加线程池压力 | 设置 `max_workers=10` 限制并发 |
| 凭据迁移后服务启动失败 | 先创建 `.env` 文件再部署 |
| 前端清洗页面组件工作量大 | 可先做最小 MVP列表 + 手动触发) |
---
## 五、执行顺序
```
Phase 1(2h) → 修复致命问题:锁机制 + 清洗模块 + async 阻塞
Phase 2(1h) → 安全问题:凭据迁移 + dev 后门
Phase 3(30m) → IP 告警 + 分析功能修复
Phase 4(30m) → Ruff 34 个 lint 错误
Phase 5(2h) → 代码去重 + 前端组件(可选)
```
---
## SESSION_ID(供 /ccg:execute 使用)
- CODEX_SESSION: N/A(本次分析由 Claude 本地执行)
- GEMINI_SESSION: N/A

View File

@ -0,0 +1,213 @@
# 📋 实施计划:Ruff 代码优化 + 项目质量提升
> 生成时间:2026-03-20
> 工作目录:/Users/win/2025/AICoding/JobData
---
## 一、现状问题总览
### 1.1 Ruff 实际扫描结果(34 个错误,22 个可自动修复)
| 规则 | 数量 | 说明 | 可自动修复 |
|------|------|------|:---:|
| `F401` | 20 | 未使用的 import | ✅ |
| `E402` | 5 | import 不在文件顶部 | ❌ |
| `F541` | 3 | f-string 没有占位符 | ✅ |
| `F821` | 3 | 引用了未定义的变量名 | ❌ |
| `F811` | 2 | 重复定义(导入后又被覆盖) | ✅ |
| `E722` | 1 | 裸 `except:`(不捕获具体异常) | ❌ |
| **合计** | **34** | | **22 可自动修复** |
### 1.2 受影响文件清单
| 文件 | 问题数 | 最严重问题 |
|------|--------|-----------|
| `app/api/v1/token/token.py` | 8 | E402 + F811(import 顺序混乱,重复定义) |
| `app/services/job.py` | 3 | **F821 未定义变量** `udt`、`fpt`;**E722 裸 except** |
| `app/services/crawler/zhilian.py` | 1 | **F821 未定义变量** `json` |
| `app/services/company_cleaner.py` | 3 | F541 空 f-string |
| `app/services/crawler/__init__.py` | 3 | F401 无效的服务导出 |
| `app/repositories/clickhouse_repo.py` | 2 | F401 `math`, `Generator` |
| `app/schemas/token.py` | 2 | F401 `Dict`, `Any` |
| `app/controllers/job.py` | 1 | F401 `Optional` |
| `app/controllers/keyword.py` | 1 | F401 `CRUDBase` |
| `app/core/algorithms/antispider.py` | 1 | F401 `os` |
| `app/core/ip_tracking.py` | 1 | F401 `Any` |
| `app/core/locks.py` | 1 | F401 `time` |
| `app/api/v1/analytics.py` | 1 | F401 `List` |
| `app/api/v1/ingest/ingest.py` | 1 | F401 `Optional` |
| `app/schemas/analytics.py` | 1 | F401 `Any` |
| `app/services/crawler/boss.py` | 1 | F401 `os` |
---
### 1.3 Ruff 扫描之外的深层问题(代码审查发现)
#### 🔴 CRITICAL — 安全问题(硬编码凭据)
| 文件 | 行号 | 问题 |
|------|------|------|
| `app/settings/config.py` | ~23 | `SECRET_KEY = "CHANGE_ME_DEV_ONLY"` JWT 密钥 |
| `app/settings/config.py` | ~27-30 | ClickHouse 主机 IP、用户名、密码明文 |
| `app/settings/config.py` | ~44-45 | SMTP 真实邮箱账号 + 授权码明文 |
| `app/settings/config.py` | ~52 | MySQL root 密码 + 生产 IP 硬编码在连接串 |
| `app/services/job.py` | ~533-535 | 外部 API salt 硬编码 |
#### 🔴 HIGH — 性能问题(事件循环阻塞)
| 文件 | 行号 | 问题 |
|------|------|------|
| `app/services/job.py` | ~547 | `async def` 中调用同步 `requests.post` 阻塞事件循环 |
| `app/services/job.py` | ~926-933 | 串行逐条发送远程推送N 条数据 = N 次串行阻塞 |
| `app/core/locks.py` | ~38 | 同步 `redis.Redis` 在 `async` 方法中调用,阻塞事件循环 |
#### 🟡 MEDIUM — 代码质量
| 文件 | 问题 |
|------|------|
| `app/services/job.py` | 7 个 `_check_*_duplicate` 方法几乎完全重复,仅 SQL 参数不同 |
| `app/services/job.py` | 1 个死代码方法:`_check_qcwy_company_duplicate_by_name` 从未被调用 |
| `app/repositories/clickhouse_repo.py` | `group_by_column` 直接拼入 SQL(潜在 SQL 注入) |
| `app/api/v1/__init__.py` | 同一 router 注册两次(`/job` 和 `/universal`),OpenAPI 文档重复 |
| 全项目 | 零测试文件,关键业务逻辑(去重、路由分发)无任何测试保护 |
---
## 二、实施步骤
### Phase 1:Ruff 自动修复(低风险,5 分钟)
```bash
# 自动修复 22 个可自动修复的问题
pipenv run ruff check app/ --fix
# 验证修复结果
pipenv run ruff check app/ --statistics
```
自动修复覆盖:F401(未使用 import)、F541(空 f-string)、F811(重复定义)。
### Phase 2:手动修复 Ruff 报告的无法自动修复问题(12 个)
#### 2.1 F821 未定义变量(CRITICAL,会导致运行时崩溃)
**`app/services/job.py:348`** — 变量 `udt` 未定义
需要读取上下文,确认 `udt` 应该是什么(可能是 `update_date_time` 的缩写或某个局部变量)。
**`app/services/job.py:374`** — 变量 `fpt` 未定义
需要读取上下文,确认 `fpt` 应该是什么(可能是 `first_publish_time` 缩写)。
**`app/services/crawler/zhilian.py:60`** — `json` 模块未导入但被使用
修复:在文件顶部添加 `import json`
#### 2.2 E722 裸 except(`app/services/job.py:302`)
```python
# 修改前
except:
pass
# 修改后
except Exception as e:
logger.error(f"处理失败: {e}")
```
#### 2.3 E402 import 不在顶部(`app/api/v1/token/token.py:92-96`)
将条件式 import 移至文件顶部,或使用 `TYPE_CHECKING` 保护块。
### Phase 3:凭据安全(CRITICAL,建议本次一并完成)
**目标**:将所有硬编码凭据移入环境变量
1. 在项目根目录创建 `.env.example`(安全模板)
2. 修改 `app/settings/config.py`,用 `pydantic-settings` 从环境变量读取所有敏感值
3. 启动时校验必填环境变量,缺失则报错退出(Fail Fast)
4. 将 `.env` 加入 `.gitignore`(已有则确认)
```python
# config.py 改造后示例
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
SECRET_KEY: str # 必填,无默认值
CLICKHOUSE_HOST: str = "localhost"
CLICKHOUSE_USER: str = "default"
CLICKHOUSE_PASS: str # 必填
SMTP_USER: str = ""
SMTP_PASS: str = ""
DB_URL: str # 必填
class Config:
env_file = ".env"
```
### Phase 4:性能修复(async 阻塞问题)
1. 将 `app/services/job.py` 中的 `requests.post` 替换为 `httpx.AsyncClient`(已在依赖中)
2. 将 `_batch_send_to_remote_server` 改为 `asyncio.gather` 并发执行
3. 将 `app/core/locks.py` 中的同步 `redis.Redis` 替换为 `redis.asyncio.Redis`(独立的 `aioredis` 包已并入 redis-py,无需另行引入)
### Phase 5:代码去重(可维护性)
合并 7 个重复的 `_check_*_duplicate` 方法为 1 个通用方法:
```python
async def _check_duplicate(
self,
table: str,
conditions: dict[str, str], # {"column_name": "value"}
days: int = 90
) -> bool:
...
```
删除死代码:`_check_qcwy_company_duplicate_by_name`
---
## 三、关键文件索引
| 文件 | 操作 | 说明 |
|------|------|------|
| `app/settings/config.py` | 重构 | 凭据移入环境变量 |
| `app/services/job.py` | 修复 | F821、E722、async 阻塞、方法去重 |
| `app/services/crawler/zhilian.py` | 修复 | 添加 `import json` |
| `app/api/v1/token/token.py` | 整理 | 修复 E402 import 顺序 |
| `app/services/company_cleaner.py` | 自动修复 | F541 空 f-string |
| `app/core/locks.py` | 修复 | 同步 redis → 异步 |
| `app/repositories/clickhouse_repo.py` | 修复 | 删除未用 import |
| `.env.example` | 新建 | 环境变量模板 |
---
## 四、风险与缓解
| 风险 | 缓解措施 |
|------|----------|
| 修复 F821 时误判变量用途 | 先读原函数完整逻辑再修复 |
| 凭据迁移后服务无法启动 | 先创建 `.env` 再重启服务 |
| async 改造引入新 bug | 修改后在本地运行完整功能测试 |
| 方法合并破坏去重逻辑 | 保持原有 SQL 逻辑不变,只提取公共参数 |
---
## 五、执行顺序建议
```
Phase 1(5min) → pipenv run ruff check app/ --fix
Phase 2(30min) → 手动修复 F821 × 3、E722 × 1、E402 × 5
Phase 3(60min) → 凭据安全迁移(需配合运维创建 .env)
Phase 4(90min) → async 阻塞修复(requests → httpx)
Phase 5(60min) → 去重方法合并(可选,不影响功能)
```
---
## SESSION_ID(供 /ccg:execute 使用)
- CODEX_SESSION: N/A(本次分析由 Claude 本地执行)
- GEMINI_SESSION: N/A

174
AGENTS.md Normal file
View File

@ -0,0 +1,174 @@
# JobData - 招聘数据采集与统计分析平台
## 变更记录 (Changelog)
| 版本 | 日期 | 说明 |
|------|------|------|
| 初始化 | 2026-03-20 | 首次生成架构文档,覆盖全部四个核心模块 |
---
## 项目愿景
JobData 是一个面向招聘市场的全栈数据采集与分析平台。系统从三大主流招聘平台(Boss 直聘、前程无忧、智联招聘)自动抓取职位与公司数据,统一存储到 ClickHouse 列式数据库,并通过 FastAPI 后端 + Vue3 前端提供数据查看、定向清洗、统计分析等能力。ECS 弹性实例管理模块支持在阿里云上按需批量启停爬虫节点。
---
## 架构总览
```
[招聘平台] [爬虫层] [后端 API] [数据库]
Boss直聘 ──► jobs_spider/boss/ ──► ┌── MySQL
前程无忧 ──► jobs_spider/qcwy/ ──► app (FastAPI) ──► └── ClickHouse
智联招聘 ──► jobs_spider/zhilian/ ──►
web (Vue3) ────────┘
(前端页面)
ecs_full_pipeline.py ──► 阿里云 ECS ──► 批量启动爬虫节点
```
**核心技术栈**
| 层次 | 技术 |
|------|------|
| 后端 | Python 3.13, FastAPI 0.111, Tortoise-ORM 0.23, APScheduler |
| 数据库(业务) | MySQL(用户/权限/审计/关键词/Token) |
| 数据库(采集) | ClickHouse(职位/公司 JSON 原始数据 + 分析视图) |
| 爬虫 | requests / httpx / Playwright(Python 脚本) |
| 前端 | Vue 3.3, Vite 4, Naive UI, Pinia, ECharts |
| 基础设施 | 阿里云 ECS(按量抢占实例),APScheduler 定时任务 |
---
## 模块结构图
```mermaid
graph TD
ROOT["(根) JobData"] --> APP["app - FastAPI 后端"]
ROOT --> WEB["web - Vue3 前端"]
ROOT --> SPIDER["jobs_spider - 平台爬虫"]
ROOT --> ECS["ecs_full_pipeline.py - ECS 批量部署"]
APP --> APP_API["app/api - 路由层"]
APP --> APP_SVC["app/services - 业务逻辑"]
APP --> APP_CORE["app/core - 框架核心"]
APP --> APP_MODELS["app/models - ORM 模型"]
APP --> APP_REPO["app/repositories - 数据仓库"]
SPIDER --> SP_BOSS["jobs_spider/boss"]
SPIDER --> SP_QCWY["jobs_spider/qcwy"]
SPIDER --> SP_ZL["jobs_spider/zhilian"]
click APP "./app/AGENTS.md" "查看 app 模块文档"
click WEB "./web/AGENTS.md" "查看 web 模块文档"
click SPIDER "./jobs_spider/AGENTS.md" "查看 jobs_spider 模块文档"
```
---
## 模块索引
| 模块路径 | 语言 | 职责简述 |
|----------|------|----------|
| `app/` | Python | FastAPI 后端,提供 REST API、权限管理、定时任务、数据入库与分析 |
| `web/` | Vue3/JS | 前端管理界面,数据展示、关键词管理、代理管理、数据清洗操作 |
| `jobs_spider/` | Python | 三大平台的爬虫脚本,独立运行,结果通过 HTTP 推送到后端 |
| `ecs_full_pipeline.py` | Python | 阿里云 ECS 实例批量创建/销毁/命令下发全流程脚本 |
| `reclean_qcwy_jobs.py` | Python | 前程无忧数据重清洗独立脚本 |
---
## 运行与开发
### 后端启动
```bash
# 安装依赖(pipenv)
pipenv install
# 开发模式(默认端口 9999,20 个 worker)
python run.py
# 环境变量覆盖
APP_HOST=0.0.0.0 APP_PORT=9999 UVICORN_WORKERS=4 python run.py
```
**关键环境变量**
| 变量 | 默认值 | 说明 |
|------|--------|------|
| `APP_HOST` | `0.0.0.0` | 监听地址 |
| `APP_PORT` | `9999` | 监听端口 |
| `UVICORN_WORKERS` | `20` | Worker 数量 |
| `CLICKHOUSE_HOST` | `121.4.126.241` | ClickHouse 地址(需修改为实际地址) |
| `CLICKHOUSE_USER` / `CLICKHOUSE_PASS` | 见 config.py | ClickHouse 认证 |
| `SMTP_HOST` / `SMTP_USER` / `SMTP_PASS` | 见 config.py | 邮件告警配置 |
| `REPORT_ENDPOINT` | 空 | 统计结果 Webhook 上报地址 |
| `RUN_MIGRATIONS_ON_STARTUP` | `True` | 是否启动时自动迁移 |
| `INITIALIZE_SEED_DATA_ON_STARTUP` | `True` | 是否启动时初始化种子数据 |
> 安全警告:`config.py` 中的 `SECRET_KEY`、数据库连接串、SMTP 密码均为硬编码默认值,生产环境必须通过环境变量覆盖。
### 前端启动
```bash
cd web
pnpm install
pnpm dev # 开发模式,默认 http://localhost:5173
pnpm build # 构建产物到 web/dist
```
### ECS 批量爬虫部署
```bash
# 需配置阿里云凭据(环境变量或 ~/.alibabacloud/credentials)
python ecs_full_pipeline.py
```
---
## 定时任务
APScheduler 在应用启动时注册以下任务(`app/core/scheduler.py`):
| 任务 ID | 频率 | 职责 |
|---------|------|------|
| `stats_job` | 每 6 小时 | 统计 ClickHouse 各表总量并通过邮件/Webhook 上报 |
| `ecs_full_pipeline` | 每 6 小时 | 调用 `ecs_full_pipeline.py` 批量刷新爬虫节点 |
| `ip_alert_job` | 每 10 分钟 | 检查 IP 上报异常并告警 |
| `company_cleaning_job` | 每 5 分钟 | 自动清洗待处理公司数据(collect 50 + process 30) |
| `daily_cleanup_job` | 每天 00:05 | 清理历史任务运行记录 |
所有任务通过分布式文件锁(或可选 Redis 锁)保证多 Worker 下只执行一次。
---
## 测试策略
- 当前代码库**无自动化测试文件**(缺口:单元测试、集成测试均缺失)。
- 推荐补充:
1. `app/services/` 的 service 层单元测试(使用 `pytest` + `anyio`)
2. `app/api/v1/` 的 API 集成测试(使用 `httpx.AsyncClient`)
3. `jobs_spider/` 的数据解析函数单元测试
---
## 编码规范
- Python:使用 `ruff`(已在 Pipfile 中),格式化用 `black`,排序用 `isort`。
- 前端:ESLint(`@zclzone` + `@unocss` 规则集),`prettier` 格式化。
- 类型:后端强制 `pydantic` Schema 做入参校验;前端以 JS 为主(未启用严格 TS)。
- 日志:后端统一使用 `loguru`,结构化字段 `logger.info(...)` 方式输出。
---
## AI 使用指引
- 修改爬虫逻辑时,重点关注反爬机制:`SmartIPManager` 和 `IPAnomalyDetector` 在 `jobs_spider/boss/boos_api.py` 中实现,随机延迟至少 10 秒。
- 新增 API 路由后需同步在 `app/api/v1/__init__.py` 注册,并执行 `api_controller.refresh_api()` 更新权限表。
- ClickHouse 表结构变更在 `app/core/clickhouse_init.py` 中维护,**不走 Aerich 迁移**。
- MySQL 模型变更走 Aerich(`aerich migrate && aerich upgrade`)。
- 前端新增页面需要在 `web/src/views/{模块}/route.js` 和后端 `init_menus()` 中同步注册菜单。
- `config.py` 中已硬编码真实 MySQL/ClickHouse 连接串和 SMTP 凭据,**提交代码前务必确认不泄露敏感信息**。

502
Pipfile.lock generated
View File

@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
"sha256": "4661b2ece0fd4084c531ca39f080cddb1c3f4924207854b9ebeba24f5f092538"
"sha256": "06a87f34157b1b7a382087d3b63ddd42071655bd37056b98510d5937e62ee726"
},
"pipfile-spec": 6,
"requires": {
@ -89,6 +89,14 @@
"markers": "python_version >= '3.6'",
"version": "==21.2.0"
},
"async-timeout": {
"hashes": [
"sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c",
"sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3"
],
"markers": "python_version >= '3.8'",
"version": "==5.0.1"
},
"asyncclick": {
"hashes": [
"sha256:be146a2d8075d4fe372ff4e877f23c8b5af269d16705c1948123b9415f6fd678"
@ -99,59 +107,67 @@
},
"asyncpg": {
"hashes": [
"sha256:04ff0785ae7eed6cc138e73fc67b8e51d54ee7a3ce9b63666ce55a0bf095f7ba",
"sha256:05b185ebb8083c8568ea8a40e896d5f7af4b8554b64d7719c0eaa1eb5a5c3a70",
"sha256:0b448f0150e1c3b96cb0438a0d0aa4871f1472e58de14a3ec320dbb2798fb0d4",
"sha256:0f5712350388d0cd0615caec629ad53c81e506b1abaaf8d14c93f54b35e3595a",
"sha256:1292b84ee06ac8a2ad8e51c7475aa309245874b61333d97411aab835c4a2f737",
"sha256:1b11a555a198b08f5c4baa8f8231c74a366d190755aa4f99aacec5970afe929a",
"sha256:1b982daf2441a0ed314bd10817f1606f1c28b1136abd9e4f11335358c2c631cb",
"sha256:1c06a3a50d014b303e5f6fc1e5f95eb28d2cee89cf58384b700da621e5d5e547",
"sha256:1c198a00cce9506fcd0bf219a799f38ac7a237745e1d27f0e1f66d3707c84a5a",
"sha256:26683d3b9a62836fad771a18ecf4659a30f348a561279d6227dab96182f46144",
"sha256:29ff1fc8b5bf724273782ff8b4f57b0f8220a1b2324184846b39d1ab4122031d",
"sha256:3152fef2e265c9c24eec4ee3d22b4f4d2703d30614b0b6753e9ed4115c8a146f",
"sha256:3326e6d7381799e9735ca2ec9fd7be4d5fef5dcbc3cb555d8a463d8460607956",
"sha256:3356637f0bd830407b5597317b3cb3571387ae52ddc3bca6233682be88bbbc1f",
"sha256:393af4e3214c8fa4c7b86da6364384c0d1b3298d45803375572f415b6f673f38",
"sha256:46973045b567972128a27d40001124fbc821c87a6cade040cfcd4fa8a30bcdc4",
"sha256:51da377487e249e35bd0859661f6ee2b81db11ad1f4fc036194bc9cb2ead5056",
"sha256:574156480df14f64c2d76450a3f3aaaf26105869cad3865041156b38459e935d",
"sha256:578445f09f45d1ad7abddbff2a3c7f7c291738fdae0abffbeb737d3fc3ab8b75",
"sha256:5b290f4726a887f75dcd1b3006f484252db37602313f806e9ffc4e5996cfe5cb",
"sha256:5df69d55add4efcd25ea2a3b02025b669a285b767bfbf06e356d68dbce4234ff",
"sha256:5e0511ad3dec5f6b4f7a9e063591d407eee66b88c14e2ea636f187da1dcfff6a",
"sha256:64e899bce0600871b55368b8483e5e3e7f1860c9482e7f12e0a771e747988168",
"sha256:68d71a1be3d83d0570049cd1654a9bdfe506e794ecc98ad0873304a9f35e411e",
"sha256:6c2a2ef565400234a633da0eafdce27e843836256d40705d83ab7ec42074efb3",
"sha256:6f4e83f067b35ab5e6371f8a4c93296e0439857b4569850b178a01385e82e9ad",
"sha256:8b684a3c858a83cd876f05958823b68e8d14ec01bb0c0d14a6704c5bf9711773",
"sha256:9110df111cabc2ed81aad2f35394a00cadf4f2e0635603db6ebbd0fc896f46a4",
"sha256:915aeb9f79316b43c3207363af12d0e6fd10776641a7de8a01212afd95bdf0ed",
"sha256:9a0292c6af5c500523949155ec17b7fe01a00ace33b68a476d6b5059f9630305",
"sha256:9b6fde867a74e8c76c71e2f64f80c64c0f3163e687f1763cfaf21633ec24ec33",
"sha256:a3479a0d9a852c7c84e822c073622baca862d1217b10a02dd57ee4a7a081f708",
"sha256:aa403147d3e07a267ada2ae34dfc9324e67ccc4cdca35261c8c22792ba2b10cf",
"sha256:aca1548e43bbb9f0f627a04666fedaca23db0a31a84136ad1f868cb15deb6e3a",
"sha256:ae374585f51c2b444510cdf3595b97ece4f233fde739aa14b50e0d64e8a7a590",
"sha256:bc6d84136f9c4d24d358f3b02be4b6ba358abd09f80737d1ac7c444f36108454",
"sha256:bfb4dd5ae0699bad2b233672c8fc5ccbd9ad24b89afded02341786887e37927e",
"sha256:c42f6bb65a277ce4d93f3fba46b91a265631c8df7250592dd4f11f8b0152150f",
"sha256:c47806b1a8cbb0a0db896f4cd34d89942effe353a5035c62734ab13b9f938da3",
"sha256:c551e9928ab6707602f44811817f82ba3c446e018bfe1d3abecc8ba5f3eac851",
"sha256:c7255812ac85099a0e1ffb81b10dc477b9973345793776b128a23e60148dd1af",
"sha256:c902a60b52e506d38d7e80e0dd5399f657220f24635fee368117b8b5fce1142e",
"sha256:db9891e2d76e6f425746c5d2da01921e9a16b5a71a1c905b13f30e12a257c4af",
"sha256:dc1f62c792752a49f88b7e6f774c26077091b44caceb1983509edc18a2222ec0",
"sha256:f23b836dd90bea21104f69547923a02b167d999ce053f3d502081acea2fba15b",
"sha256:f59b430b8e27557c3fb9869222559f7417ced18688375825f8f12302c34e915e",
"sha256:f86b0e2cd3f1249d6fe6fd6cfe0cd4538ba994e2d8249c0491925629b9104d0f",
"sha256:fb622c94db4e13137c4c7f98834185049cc50ee01d8f657ef898b6407c7b9c50",
"sha256:fd4406d09208d5b4a14db9a9dbb311b6d7aeeab57bded7ed2f8ea41aeef39b34"
"sha256:027eaa61361ec735926566f995d959ade4796f6a49d3bde17e5134b9964f9ba8",
"sha256:04d19392716af6b029411a0264d92093b6e5e8285ae97a39957b9a9c14ea72be",
"sha256:0b17c89312c2f4ccea222a3a6571f7df65d4ba2c0e803339bfc7bed46a96d3be",
"sha256:0bfbcc5b7ffcd9b75ab1558f00db2ae07db9c80637ad1b2469c43df79d7a5ae2",
"sha256:0c89ccf741c067614c9b5fc7f1fc6f3b61ab05ae4aaa966e6fd6b93097c7d20d",
"sha256:12b3b2e39dc5470abd5e98c8d3373e4b1d1234d9fbdedf538798b2c13c64460a",
"sha256:18c83b03bc0d1b23e6230f5bf8d4f217dc9bc08644ce0502a9d91dc9e634a9c7",
"sha256:19857a358fc811d82227449b7ca40afb46e75b33eb8897240c3839dd8b744218",
"sha256:1b41f1afb1033f2b44f3234993b15096ddc9cd71b21a42dbd87fc6a57b43d65d",
"sha256:22bc525ebbdc24d1261ecbf6f504998244d4e3be1721784b5f64664d61fbe602",
"sha256:22be6e02381bab3101cd502d9297ac71e2f966c86e20e78caead9934c98a8af6",
"sha256:2657204552b75f8288de08ca60faf4a99a65deef3a71d1467454123205a88fab",
"sha256:2d076d42eb583601179efa246c5d7ae44614b4144bc1c7a683ad1222814ed095",
"sha256:334dec28cf20d7f5bb9e45b39546ddf247f8042a690bff9b9573d00086e69cb5",
"sha256:37a58919cfef2448a920df00d1b2f821762d17194d0dbf355d6dde8d952c04f9",
"sha256:37fc6c00a814e18eef51833545d1891cac9aa69140598bb076b4cd29b3e010b9",
"sha256:3b1fbcb0e396a5ca435a8826a87e5c2c2cc0c8c68eb6fadf82168056b0e53a8c",
"sha256:3df118d94f46d85b2e434fd62c84cb66d5834d5a890725fe625f498e72e4d5ec",
"sha256:3faa62f997db0c9add34504a68ac2c342cfee4d57a0c3062fcf0d86c7f9cb1e8",
"sha256:480c4befbdf079c14c9ca43c8c5e1fe8b6296c96f1f927158d4f1e750aacc047",
"sha256:54a64f91839ba59008eccf7aad2e93d6e3de688d796f35803235ea1c4898ae1e",
"sha256:5a4af56edf82a701aece93190cc4e094d2df7d33f6e915c222fb09efbb5afc24",
"sha256:6d11b198111a72f47154fa03b85799f9be63701e068b43f84ac25da0bda9cb31",
"sha256:72d6bdcbc93d608a1158f17932de2321f68b1a967a13e014998db87a72ed3186",
"sha256:795416369c3d284e1837461909f58418ad22b305f955e625a4b3a2521d80a5f3",
"sha256:831712dd3cf117eec68575a9b50da711893fd63ebe277fc155ecae1c6c9f0f61",
"sha256:8df714dba348efcc162d2adf02d213e5fab1bd9f557e1305633e851a61814a7a",
"sha256:8ea599d45c361dfbf398cb67da7fd052affa556a401482d3ff1ee99bd68808a1",
"sha256:9322b563e2661a52e3cdbc93eed3be7748b289f792e0011cb2720d278b366ce2",
"sha256:98cc158c53f46de7bb677fd20c417e264fc02b36d901cc2a43bd6cb0dc6dbfd2",
"sha256:9ea33213ac044171f4cac23740bed9a3805abae10e7025314cfbd725ec670540",
"sha256:a429e842a3a4b4ea240ea52d7fe3f82d5149853249306f7ff166cb9948faa46c",
"sha256:a8d758dac9d2e723e173d286ef5e574f0b350ec00e9186fce84d0fc5f6a8e6b8",
"sha256:aad7a33913fb8bcb5454313377cc330fbb19a0cd5faa7272407d8a0c4257b671",
"sha256:b44c31e1efc1c15188ef183f287c728e2046abb1d26af4d20858215d50d91fad",
"sha256:ba5f8886e850882ff2c2ace5732300e99193823e8107e2c53ef01c1ebfa1e85d",
"sha256:bb223567dea5f47c45d347f2bde5486be8d9f40339f27217adb3fb1c3be51298",
"sha256:bc2b685f400ceae428f79f78b58110470d7b4466929a7f78d455964b17ad1008",
"sha256:bd4107bb7cdd0e9e65fae66a62afd3a249663b844fa34d479f6d5b3bef9c04c3",
"sha256:bd5b6efff3c17c3202d4b37189969acf8927438a238c6257f66be3c426beba20",
"sha256:bdb957706da132e982cc6856bb2f7b740603472b54c3ebc77fe60ea3e57e1bd2",
"sha256:bef056aa502ee34204c161c72ca1f3c274917596877f825968368b2c33f585f4",
"sha256:c0807be46c32c963ae40d329b3a686356e417f674c976c07fa49f1b30303f109",
"sha256:c0e0822b1038dc7253b337b0f3f676cadc4ac31b126c5d42691c39691962e403",
"sha256:c1a9c5b71d2371a2290bc93336cd05ba4ec781683cab292adbddc084f89443c6",
"sha256:c1e1ab5bc65373d92dd749d7308c5b26fb2dc0fbe5d3bf68a32b676aa3bcd24a",
"sha256:c204fab1b91e08b0f47e90a75d1b3c62174dab21f670ad6c5d0f243a228f015b",
"sha256:c989386c83940bfbd787180f2b1519415e2d3d6277a70d9d0f0145ac73500735",
"sha256:cea3a0b2a14f95834cee29432e4ddc399b95700eb1d51bbc5bfee8f31fa07b2b",
"sha256:dc5f2fa9916f292e5c5c8b2ac2813763bcd7f58e130055b4ad8a0531314201ab",
"sha256:e009abc333464ff18b8f6fd146addffd9aaf63e79aa3bb40ab7a4c332d0c5e9e",
"sha256:e5d5098f63beeae93512ee513d4c0c53dc12e9aa2b7a1af5a81cddf93fe4e4da",
"sha256:e6974f36eb9a224d8fb428bcf66bd411aa12cf57c2967463178149e73d4de366",
"sha256:ebb3cde58321a1f89ce41812be3f2a98dddedc1e76d0838aba1d724f1e4e1a95",
"sha256:eee690960e8ab85063ba93af2ce128c0f52fd655fdff9fdb1a28df01329f031d",
"sha256:f6b56b91bb0ffc328c4e3ed113136cddd9deefdf5f79ab448598b9772831df44",
"sha256:f890de5e1e4f7e14023619399a471ce4b71f5418cd67a51853b9910fdfa73696"
],
"index": "pip_conf_index_global",
"markers": "python_full_version >= '3.8.0'",
"version": "==0.30.0"
"markers": "python_full_version >= '3.9.0'",
"version": "==0.31.0"
},
"black": {
"hashes": [
@ -308,6 +324,14 @@
"markers": "python_version >= '3.8'",
"version": "==2.0.0"
},
"exceptiongroup": {
"hashes": [
"sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219",
"sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598"
],
"markers": "python_version >= '3.7'",
"version": "==1.3.1"
},
"fastapi": {
"hashes": [
"sha256:97ecbf994be0bcbdadedf88c3150252bed7b2087075ac99735403b1b76cc8fc0",
@ -326,6 +350,59 @@
"markers": "python_version >= '3.8'",
"version": "==0.0.7"
},
"greenlet": {
"hashes": [
"sha256:04633da773ae432649a3f092a8e4add390732cc9e1ab52c8ff2c91b8dc86f202",
"sha256:04e6a202cde56043fd355fefd1552c4caa5c087528121871d950eb4f1b51fa99",
"sha256:050703a60603db0e817364d69e048c70af299040c13a7e67792b9e62d4571196",
"sha256:0bc06a78fa3ffbe2a75f1ebc7e040eacf6fa1050a9432953ab111fbbbf0d03c1",
"sha256:0d2a78e6f1bf3f1672df91e212a2f8314e1e7c922f065d14cbad4bc815059467",
"sha256:15871afc0d78ec87d15d8412b337f287fc69f8f669346e391585824970931c48",
"sha256:2acb30e77042f747ca81f0a10cc153296567e92e666c5e1b117f4595afd43352",
"sha256:2c7429f6e9cea7cbf2637d86d3db12806ba970f7f972fcab39d6b54b4457cbaf",
"sha256:34cc7cf8ab6f4b85298b01e13e881265ee7b3c1daf6bc10a2944abc15d4f87c3",
"sha256:3828b309dfb1f117fe54867512a8265d8d4f00f8de6908eef9b885f4d8789062",
"sha256:393c03c26c865f17f31d8db2f09603fadbe0581ad85a5d5908b131549fc38217",
"sha256:4544ab2cfd5912e42458b13516429e029f87d8bbcdc8d5506db772941ae12493",
"sha256:45fcea7b697b91290b36eafc12fff479aca6ba6500d98ef6f34d5634c7119cbe",
"sha256:472841de62d60f2cafd60edd4fd4dd7253eb70e6eaf14b8990dcaf177f4af957",
"sha256:499b809e7738c8af0ff9ac9d5dd821cb93f4293065a9237543217f0b252f950a",
"sha256:5bf0d7d62e356ef2e87e55e46a4e930ac165f9372760fb983b5631bb479e9d3a",
"sha256:5ceb29d1f74c7280befbbfa27b9bf91ba4a07a1a00b2179a5d953fc219b16c42",
"sha256:60c06b502d56d5451f60ca665691da29f79ed95e247bcf8ce5024d7bbe64acb9",
"sha256:6712bfd520530eb67331813f7112d3ee18e206f48b3d026d8a96cd2d2ad20251",
"sha256:67725ae9fea62c95cf1aa230f1b8d4dc38f7cd14f6103d1df8a5a95657eb8e54",
"sha256:6dff6433742073e5b6ad40953a78a0e8cddcb3f6869e5ea635d29a810ca5e7d0",
"sha256:6e8fe0c72603201a86b2e038daf9b6c8570715f8779566419cff543b6ace88de",
"sha256:7123b29e6bad2f3f89681be4ef316480fca798ebe8d22fbaced9cc3775007a4f",
"sha256:752c896a8c976548faafe8a306d446c6a4c68d4fd24699b84d4393bd9ac69a8e",
"sha256:7d951e7d628a6e8b68af469f0fe4f100ef64c4054abeb9cdafbfaa30a920c950",
"sha256:87b791dd0e031a574249af717ac36f7031b18c35329561c1e0368201c18caf1f",
"sha256:a145f4b1c4ed7a2c94561b7f18b4beec3d3fb6f0580db22f7ed1d544e0620b34",
"sha256:a5e4b25e855800fba17713020c5c33e0a4b7a1829027719344f0c7c8870092a2",
"sha256:ac8db07bced2c39b987bba13a3195f8157b0cfbce54488f86919321444a1cc3c",
"sha256:acabf468466d18017e2ae5fbf1a5a88b86b48983e550e1ae1437b69a83d9f4ac",
"sha256:bd593db7ee1fa8a513a48a404f8cc4126998a48025e3f5cbbc68d51be0a6bf66",
"sha256:bdd67619cefe1cc9fcab57c8853d2bb36eca9f166c0058cc0d428d471f7c785c",
"sha256:c11fe0cfb0ce33132f0b5d27eeadd1954976a82e5e9b60909ec2c4b884a55382",
"sha256:c5445ddb7b586d870dad32ca9fc47c287d6022a528d194efdb8912093c5303ad",
"sha256:c816554eb33e7ecf9ba4defcb1fd8c994e59be6b4110da15480b3e7447ea4286",
"sha256:c8317d732e2ae0935d9ed2af2ea876fa714cf6f3b887a31ca150b54329b0a6e9",
"sha256:cc1d01bdd67db3e5711e6246e451d7a0f75fae7bbf40adde129296a7f9aa7cc9",
"sha256:ce8aed6fdd5e07d3cbb988cbdc188266a4eb9e1a52db9ef5c6526e59962d3933",
"sha256:d5583b2ffa677578a384337ee13125bdf9a427485d689014b39d638a4f3d8dbe",
"sha256:d7456e67b0be653dfe643bb37d9566cd30939c80f858e2ce6d2d54951f75b14a",
"sha256:dbe0e81e24982bb45907ca20152b31c2e3300ca352fdc4acbd4956e4a2cbc195",
"sha256:e3f03ddd7142c758ab41c18089a1407b9959bd276b4e6dfbd8fd06403832c87a",
"sha256:e66872daffa360b2537170b73ad530f14fa31785b1bc78080125d92edf0a6def",
"sha256:edbf4ab9a7057ee430a678fe2ef37ea5d69125d6bdc7feb42ed8d871c737e63b",
"sha256:f2cc88b50b9006b324c1b9f5f3552f9d4564c78af57cdfb4c7baf4f0aa089146",
"sha256:f96e2bb8a56b7e1aed1dbfbbe0050cb2ecca99c7c91892fd1771e3afab63b3e3",
"sha256:fd904626b8779810062cb455514594776e3cba3b8c0ba4939894df9f7b384971"
],
"markers": "python_version >= '3.9'",
"version": "==3.2.5"
},
"h11": {
"hashes": [
"sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d",
@ -545,83 +622,54 @@
},
"numpy": {
"hashes": [
"sha256:07b62978075b67eee4065b166d000d457c82a1efe726cce608b9db9dd66a73a5",
"sha256:087ffc25890d89a43536f75c5fe8770922008758e8eeeef61733957041ed2f9b",
"sha256:092aeb3449833ea9c0bf0089d70c29ae480685dd2377ec9cdbbb620257f84631",
"sha256:095737ed986e00393ec18ec0b21b47c22889ae4b0cd2d5e88342e08b01141f58",
"sha256:0a4f2021a6da53a0d580d6ef5db29947025ae8b35b3250141805ea9a32bbe86b",
"sha256:103ea7063fa624af04a791c39f97070bf93b96d7af7eb23530cd087dc8dbe9dc",
"sha256:11e58218c0c46c80509186e460d79fbdc9ca1eb8d8aee39d8f2dc768eb781089",
"sha256:122bf5ed9a0221b3419672493878ba4967121514b1d7d4656a7580cd11dddcbf",
"sha256:14a91ebac98813a49bc6aa1a0dfc09513dcec1d97eaf31ca21a87221a1cdcb15",
"sha256:1f91e5c028504660d606340a084db4b216567ded1056ea2b4be4f9d10b67197f",
"sha256:20b8200721840f5621b7bd03f8dcd78de33ec522fc40dc2641aa09537df010c3",
"sha256:240259d6564f1c65424bcd10f435145a7644a65a6811cfc3201c4a429ba79170",
"sha256:2738534837c6a1d0c39340a190177d7d66fdf432894f469728da901f8f6dc910",
"sha256:27c9f90e7481275c7800dc9c24b7cc40ace3fdb970ae4d21eaff983a32f70c91",
"sha256:293b2192c6bcce487dbc6326de5853787f870aeb6c43f8f9c6496db5b1781e45",
"sha256:2c3271cc4097beb5a60f010bcc1cc204b300bb3eafb4399376418a83a1c6373c",
"sha256:2f4f0215edb189048a3c03bd5b19345bdfa7b45a7a6f72ae5945d2a28272727f",
"sha256:3dcf02866b977a38ba3ec10215220609ab9667378a9e2150615673f3ffd6c73b",
"sha256:4209f874d45f921bde2cff1ffcd8a3695f545ad2ffbef6d3d3c6768162efab89",
"sha256:448a66d052d0cf14ce9865d159bfc403282c9bc7bb2a31b03cc18b651eca8b1a",
"sha256:4ae6863868aaee2f57503c7a5052b3a2807cf7a3914475e637a0ecd366ced220",
"sha256:4d002ecf7c9b53240be3bb69d80f86ddbd34078bae04d87be81c1f58466f264e",
"sha256:4e6ecfeddfa83b02318f4d84acf15fbdbf9ded18e46989a15a8b6995dfbf85ab",
"sha256:508b0eada3eded10a3b55725b40806a4b855961040180028f52580c4729916a2",
"sha256:546aaf78e81b4081b2eba1d105c3b34064783027a06b3ab20b6eba21fb64132b",
"sha256:572d5512df5470f50ada8d1972c5f1082d9a0b7aa5944db8084077570cf98370",
"sha256:5ad4ebcb683a1f99f4f392cc522ee20a18b2bb12a2c1c42c3d48d5a1adc9d3d2",
"sha256:66459dccc65d8ec98cc7df61307b64bf9e08101f9598755d42d8ae65d9a7a6ee",
"sha256:6936aff90dda378c09bea075af0d9c675fe3a977a9d2402f95a87f440f59f619",
"sha256:69779198d9caee6e547adb933941ed7520f896fd9656834c300bdf4dd8642712",
"sha256:6f1ae3dcb840edccc45af496f312528c15b1f79ac318169d094e85e4bb35fdf1",
"sha256:71669b5daae692189540cffc4c439468d35a3f84f0c88b078ecd94337f6cb0ec",
"sha256:72c6df2267e926a6d5286b0a6d556ebe49eae261062059317837fda12ddf0c1a",
"sha256:72dbebb2dcc8305c431b2836bcc66af967df91be793d63a24e3d9b741374c450",
"sha256:754d6755d9a7588bdc6ac47dc4ee97867271b17cee39cb87aef079574366db0a",
"sha256:76c3e9501ceb50b2ff3824c3589d5d1ab4ac857b0ee3f8f49629d0de55ecf7c2",
"sha256:7a0e27186e781a69959d0230dd9909b5e26024f8da10683bd6344baea1885168",
"sha256:7d6e390423cc1f76e1b8108c9b6889d20a7a1f59d9a60cac4a050fa734d6c1e2",
"sha256:8145dd6d10df13c559d1e4314df29695613575183fa2e2d11fac4c208c8a1f73",
"sha256:8446acd11fe3dc1830568c941d44449fd5cb83068e5c70bd5a470d323d448296",
"sha256:852ae5bed3478b92f093e30f785c98e0cb62fa0a939ed057c31716e18a7a22b9",
"sha256:87c930d52f45df092f7578889711a0768094debf73cfcde105e2d66954358125",
"sha256:8b1224a734cd509f70816455c3cffe13a4f599b1bf7130f913ba0e2c0b2006c0",
"sha256:8dc082ea901a62edb8f59713c6a7e28a85daddcb67454c839de57656478f5b19",
"sha256:906a30249315f9c8e17b085cc5f87d3f369b35fedd0051d4a84686967bdbbd0b",
"sha256:938065908d1d869c7d75d8ec45f735a034771c6ea07088867f713d1cd3bbbe4f",
"sha256:9c144440db4bf3bb6372d2c3e49834cc0ff7bb4c24975ab33e01199e645416f2",
"sha256:9e196ade2400c0c737d93465327d1ae7c06c7cb8a1756121ebf54b06ca183c7f",
"sha256:a3ef07ec8cbc8fc9e369c8dcd52019510c12da4de81367d8b20bc692aa07573a",
"sha256:a7af9ed2aa9ec5950daf05bb11abc4076a108bd3c7db9aa7251d5f107079b6a6",
"sha256:a9f66e7d2b2d7712410d3bc5684149040ef5f19856f20277cd17ea83e5006286",
"sha256:aa098a5ab53fa407fded5870865c6275a5cd4101cfdef8d6fafc48286a96e981",
"sha256:af58de8745f7fa9ca1c0c7c943616c6fe28e75d0c81f5c295810e3c83b5be92f",
"sha256:b05a89f2fb84d21235f93de47129dd4f11c16f64c87c33f5e284e6a3a54e43f2",
"sha256:b5e40e80299607f597e1a8a247ff8d71d79c5b52baa11cc1cce30aa92d2da6e0",
"sha256:b9d0878b21e3918d76d2209c924ebb272340da1fb51abc00f986c258cd5e957b",
"sha256:bc3186bea41fae9d8e90c2b4fb5f0a1f5a690682da79b92574d63f56b529080b",
"sha256:c63d95dc9d67b676e9108fe0d2182987ccb0f11933c1e8959f42fa0da8d4fa56",
"sha256:c771cfac34a4f2c0de8e8c97312d07d64fd8f8ed45bc9f5726a7e947270152b5",
"sha256:c8d9727f5316a256425892b043736d63e89ed15bbfe6556c5ff4d9d4448ff3b3",
"sha256:cbc95b3813920145032412f7e33d12080f11dc776262df1712e1638207dde9e8",
"sha256:cefc2219baa48e468e3db7e706305fcd0c095534a192a08f31e98d83a7d45fb0",
"sha256:d95f59afe7f808c103be692175008bab926b59309ade3e6d25009e9a171f7036",
"sha256:dd937f088a2df683cbb79dda9a772b62a3e5a8a7e76690612c2737f38c6ef1b6",
"sha256:de6ea4e5a65d5a90c7d286ddff2b87f3f4ad61faa3db8dabe936b34c2275b6f8",
"sha256:e0486a11ec30cdecb53f184d496d1c6a20786c81e55e41640270130056f8ee48",
"sha256:ee807923782faaf60d0d7331f5e86da7d5e3079e28b291973c545476c2b00d07",
"sha256:efc81393f25f14d11c9d161e46e6ee348637c0a1e8a54bf9dedc472a3fae993b",
"sha256:f0a1a8476ad77a228e41619af2fa9505cf69df928e9aaa165746584ea17fed2b",
"sha256:f75018be4980a7324edc5930fe39aa391d5734531b1926968605416ff58c332d",
"sha256:f92d6c2a8535dc4fe4419562294ff957f83a16ebdec66df0805e473ffaad8bd0",
"sha256:fb1752a3bb9a3ad2d6b090b88a9a0ae1cd6f004ef95f75825e2f382c183b2097",
"sha256:fc927d7f289d14f5e037be917539620603294454130b6de200091e23d27dc9be",
"sha256:fed5527c4cf10f16c6d0b6bee1f89958bccb0ad2522c8cadc2efd318bcd545f5"
"sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a",
"sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195",
"sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951",
"sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1",
"sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c",
"sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc",
"sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b",
"sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd",
"sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4",
"sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd",
"sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318",
"sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448",
"sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece",
"sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d",
"sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5",
"sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8",
"sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57",
"sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78",
"sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66",
"sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a",
"sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e",
"sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c",
"sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa",
"sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d",
"sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c",
"sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729",
"sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97",
"sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c",
"sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9",
"sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669",
"sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4",
"sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73",
"sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385",
"sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8",
"sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c",
"sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b",
"sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692",
"sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15",
"sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131",
"sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a",
"sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326",
"sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b",
"sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded",
"sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04",
"sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd"
],
"markers": "python_version >= '3.11'",
"version": "==2.3.2"
"markers": "python_version >= '3.9'",
"version": "==2.0.2"
},
"openpyxl": {
"hashes": [
@ -725,52 +773,65 @@
},
"pandas": {
"hashes": [
"sha256:025e92411c16cbe5bb2a4abc99732a6b132f439b8aab23a59fa593eb00704232",
"sha256:09e3b1587f0f3b0913e21e8b32c3119174551deb4a4eba4a89bc7377947977e7",
"sha256:0a95b9ac964fe83ce317827f80304d37388ea77616b1425f0ae41c9d2d0d7bb2",
"sha256:0f951fbb702dacd390561e0ea45cdd8ecfa7fb56935eb3dd78e306c19104b9b0",
"sha256:1b916a627919a247d865aed068eb65eb91a344b13f5b57ab9f610b7716c92de1",
"sha256:1c78cf43c8fde236342a1cb2c34bcff89564a7bfed7e474ed2fffa6aed03a956",
"sha256:1d12f618d80379fde6af007f65f0c25bd3e40251dbd1636480dfffce2cf1e6da",
"sha256:22c2e866f7209ebc3a8f08d75766566aae02bcc91d196935a1d9e59c7b990ac9",
"sha256:2323294c73ed50f612f67e2bf3ae45aea04dce5690778e08a09391897f35ff88",
"sha256:2b0540963d83431f5ce8870ea02a7430adca100cec8a050f0811f8e31035541b",
"sha256:2ba6aff74075311fc88504b1db890187a3cd0f887a5b10f5525f8e2ef55bfdb9",
"sha256:2eb789ae0274672acbd3c575b0598d213345660120a257b47b5dafdc618aec83",
"sha256:2f4d6feeba91744872a600e6edbbd5b033005b431d5ae8379abee5bcfa479fab",
"sha256:342e59589cc454aaff7484d75b816a433350b3d7964d7847327edda4d532a2e3",
"sha256:3462c3735fe19f2638f2c3a40bd94ec2dc5ba13abbb032dd2fa1f540a075509d",
"sha256:3583d348546201aff730c8c47e49bc159833f971c2899d6097bce68b9112a4f1",
"sha256:4645f770f98d656f11c69e81aeb21c6fca076a44bed3dcbb9396a4311bc7f6d8",
"sha256:4d544806b485ddf29e52d75b1f559142514e60ef58a832f74fb38e48d757b299",
"sha256:56a342b231e8862c96bdb6ab97170e203ce511f4d0429589c8ede1ee8ece48b8",
"sha256:5db9637dbc24b631ff3707269ae4559bce4b7fd75c1c4d7e13f40edc42df4444",
"sha256:689968e841136f9e542020698ee1c4fbe9caa2ed2213ae2388dc7b81721510d3",
"sha256:6de8547d4fdb12421e2d047a2c446c623ff4c11f47fddb6b9169eb98ffba485a",
"sha256:6f3bf5ec947526106399a9e1d26d40ee2b259c66422efdf4de63c848492d91bb",
"sha256:782647ddc63c83133b2506912cc6b108140a38a37292102aaa19c81c83db2928",
"sha256:7dcb79bf373a47d2a40cf7232928eb7540155abbc460925c2c96d2d30b006eb4",
"sha256:8dfc17328e8da77be3cf9f47509e5637ba8f137148ed0e9b5241e1baf526e20a",
"sha256:9026bd4a80108fac2239294a15ef9003c4ee191a0f64b90f170b40cfb7cf2d22",
"sha256:911580460fc4884d9b05254b38a6bfadddfcc6aaef856fb5859e7ca202e45275",
"sha256:98bcc8b5bf7afed22cc753a28bc4d9e26e078e777066bc53fac7904ddef9a678",
"sha256:9b7ff55f31c4fcb3e316e8f7fa194566b286d6ac430afec0d461163312c5841e",
"sha256:ac942bfd0aca577bef61f2bc8da8147c4ef6879965ef883d8e8d5d2dc3e744b8",
"sha256:b3cd4273d3cb3707b6fffd217204c52ed92859533e31dc03b7c5008aa933aaab",
"sha256:b4b0de34dc8499c2db34000ef8baad684cfa4cbd836ecee05f323ebfba348c7d",
"sha256:ca7ed14832bce68baef331f4d7f294411bed8efd032f8109d690df45e00c4679",
"sha256:cd05b72ec02ebfb993569b4931b2e16fbb4d6ad6ce80224a3ee838387d83a191",
"sha256:dd71c47a911da120d72ef173aeac0bf5241423f9bfea57320110a978457e069e",
"sha256:e5635178b387bd2ba4ac040f82bc2ef6e6b500483975c4ebacd34bec945fda12",
"sha256:e6723a27ad7b244c0c79d8e7007092d7c8f0f11305770e2f4cd778b3ad5f9f85",
"sha256:ec6c851509364c59a5344458ab935e6451b31b818be467eb24b0fe89bd05b6b9",
"sha256:fe37e757f462d31a9cd7580236a82f353f5713a80e059a29753cf938c6775d96",
"sha256:fe67dc676818c186d5a3d5425250e40f179c2a89145df477dd82945eaea89e97",
"sha256:fe7317f578c6a153912bd2292f02e40c1d8f253e93c599e82620c7f69755c74f"
"sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7",
"sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593",
"sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5",
"sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791",
"sha256:23ebd657a4d38268c7dfbdf089fbc31ea709d82e4923c5ffd4fbd5747133ce73",
"sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec",
"sha256:28083c648d9a99a5dd035ec125d42439c6c1c525098c58af0fc38dd1a7a1b3d4",
"sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5",
"sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac",
"sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084",
"sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c",
"sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87",
"sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35",
"sha256:4793891684806ae50d1288c9bae9330293ab4e083ccd1c5e383c34549c6e4250",
"sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c",
"sha256:503cf027cf9940d2ceaa1a93cfb5f8c8c7e6e90720a2850378f0b3f3b1e06826",
"sha256:5554c929ccc317d41a5e3d1234f3be588248e61f08a74dd17c9eabb535777dc9",
"sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713",
"sha256:5caf26f64126b6c7aec964f74266f435afef1c1b13da3b0636c7518a1fa3e2b1",
"sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523",
"sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3",
"sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78",
"sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53",
"sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c",
"sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21",
"sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5",
"sha256:854d00d556406bffe66a4c0802f334c9ad5a96b4f1f868adf036a21b11ef13ff",
"sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45",
"sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110",
"sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493",
"sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b",
"sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450",
"sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86",
"sha256:a637c5cdfa04b6d6e2ecedcb81fc52ffb0fd78ce2ebccc9ea964df9f658de8c8",
"sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98",
"sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89",
"sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66",
"sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b",
"sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8",
"sha256:bf1f8a81d04ca90e32a0aceb819d34dbd378a98bf923b6398b9a3ec0bf44de29",
"sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6",
"sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc",
"sha256:c503ba5216814e295f40711470446bc3fd00f0faea8a086cbc688808e26f92a2",
"sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788",
"sha256:d3e28b3e83862ccf4d85ff19cf8c20b2ae7e503881711ff2d534dc8f761131aa",
"sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151",
"sha256:dd7478f1463441ae4ca7308a70e90b33470fa593429f9d4c578dd00d1fa78838",
"sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b",
"sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a",
"sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d",
"sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908",
"sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0",
"sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b",
"sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c",
"sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee"
],
"index": "pip_conf_index_global",
"markers": "python_version >= '3.9'",
"version": "==2.3.1"
"version": "==2.3.3"
},
"passlib": {
"hashes": [
@ -798,6 +859,21 @@
"markers": "python_version >= '3.8'",
"version": "==4.3.6"
},
"playwright": {
"hashes": [
"sha256:1dd93b265688da46e91ecb0606d36f777f8eadcf7fbef12f6426b20bf0c9137c",
"sha256:284ed5a706b7c389a06caa431b2f0ba9ac4130113c3a779767dda758c2497bb1",
"sha256:38a1bae6c0a07839cdeaddbc0756b3b2b85e476c07945f64ece08f1f956a86f1",
"sha256:5f065f5a133dbc15e6e7c71e7bc04f258195755b1c32a432b792e28338c8335e",
"sha256:6caefb08ed2c6f29d33b8088d05d09376946e49a73be19271c8cd5384b82b14c",
"sha256:9351c1ac3dfd9b3820fe7fc4340d96c0d3736bb68097b9b7a69bd45d25e9370c",
"sha256:99104771abc4eafee48f47dac2369e0015516dc1ce8c409807d2dd440828b9a4",
"sha256:a4a9d65027bce48eeba842408bcc1421502dfd7e41e28d207e94260fa93ca67e"
],
"index": "pip_conf_index_global",
"markers": "python_version >= '3.9'",
"version": "==1.57.0"
},
"pycparser": {
"hashes": [
"sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6",
@ -932,6 +1008,21 @@
"markers": "python_version >= '3.8'",
"version": "==2.7.1"
},
"pyee": {
"hashes": [
"sha256:0b931f7c14535667ed4c7e0d531716368715e860b988770fc7eb8578d1f67fc8",
"sha256:af2f8fede4171ef667dfded53f96e2ed0d6e6bd7ee3bb46437f77e3b57689228"
],
"markers": "python_version >= '3.8'",
"version": "==13.0.1"
},
"pyexecjs": {
"hashes": [
"sha256:34cc1d070976918183ff7bdc0ad71f8157a891c92708c00c5fbbff7a769f505c"
],
"index": "pip_conf_index_global",
"version": "==1.5.1"
},
"pygments": {
"hashes": [
"sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f",
@ -959,6 +1050,16 @@
"markers": "python_version >= '3.8' and python_version < '4.0'",
"version": "==0.3.2"
},
"pysocks": {
"hashes": [
"sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299",
"sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5",
"sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"
],
"index": "pip_conf_index_global",
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==1.7.1"
},
"python-dateutil": {
"hashes": [
"sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3",
@ -1202,6 +1303,59 @@
"markers": "python_version >= '3.8'",
"version": "==0.37.2"
},
"tomli": {
"hashes": [
"sha256:0408e3de5ec77cc7f81960c362543cbbd91ef883e3138e81b729fc3eea5b9729",
"sha256:0dc56fef0e2c1c470aeac5b6ca8cc7b640bb93e92d9803ddaf9ea03e198f5b0b",
"sha256:0e0fe8a0b8312acf3a88077a0802565cb09ee34107813bba1c7cd591fa6cfc8d",
"sha256:0f2e3955efea4d1cfbcb87bc321e00dc08d2bcb737fd1d5e398af111d86db5df",
"sha256:133e93646ec4300d651839d382d63edff11d8978be23da4cc106f5a18b7d0576",
"sha256:1b168f2731796b045128c45982d3a4874057626da0e2ef1fdd722848b741361d",
"sha256:1c8a885b370751837c029ef9bc014f27d80840e48bac415f3412e6593bbc18c1",
"sha256:1f776e7d669ebceb01dee46484485f43a4048746235e683bcdffacdf1fb4785a",
"sha256:1fb2945cbe303b1419e2706e711b7113da57b7db31ee378d08712d678a34e51e",
"sha256:20cedb4ee43278bc4f2fee6cb50daec836959aadaf948db5172e776dd3d993fc",
"sha256:20ffd184fb1df76a66e34bd1b36b4a4641bd2b82954befa32fe8163e79f1a702",
"sha256:26ab906a1eb794cd4e103691daa23d95c6919cc2fa9160000ac02370cc9dd3f6",
"sha256:2add28aacc7425117ff6364fe9e06a183bb0251b03f986df0e78e974047571fd",
"sha256:2b1e3b80e1d5e52e40e9b924ec43d81570f0e7d09d11081b797bc4692765a3d4",
"sha256:31d556d079d72db7c584c0627ff3a24c5d3fb4f730221d3444f3efb1b2514776",
"sha256:36b9d05b51e65b254ea6c2585b59d2c4cb91c8a3d91d0ed0f17591a29aaea54a",
"sha256:39b0b5d1b6dd03684b3fb276407ebed7090bbec989fa55838c98560c01113b66",
"sha256:3cf226acb51d8f1c394c1b310e0e0e61fecdd7adcb78d01e294ac297dd2e7f87",
"sha256:3d895d56bd3f82ddd6faaff993c275efc2ff38e52322ea264122d72729dca2b2",
"sha256:413540dce94673591859c4c6f794dfeaa845e98bf35d72ed59636f869ef9f86f",
"sha256:43e685b9b2341681907759cf3a04e14d7104b3580f808cfde1dfdb60ada85475",
"sha256:4cbcb367d44a1f0c2be408758b43e1ffb5308abe0ea222897d6bfc8e8281ef2f",
"sha256:551e321c6ba03b55676970b47cb1b73f14a0a4dce6a3e1a9458fd6d921d72e95",
"sha256:5572e41282d5268eb09a697c89a7bee84fae66511f87533a6f88bd2f7b652da9",
"sha256:5aa48d7c2356055feef06a43611fc401a07337d5b006be13a30f6c58f869e3c3",
"sha256:5b5807f3999fb66776dbce568cc9a828544244a8eb84b84b9bafc080c99597b9",
"sha256:5e3f639a7a8f10069d0e15408c0b96a2a828cfdec6fca05296ebcdcc28ca7c76",
"sha256:685306e2cc7da35be4ee914fd34ab801a6acacb061b6a7abca922aaf9ad368da",
"sha256:75c2f8bbddf170e8effc98f5e9084a8751f8174ea6ccf4fca5398436e0320bc8",
"sha256:7b438885858efd5be02a9a133caf5812b8776ee0c969fea02c45e8e3f296ba51",
"sha256:7d49c66a7d5e56ac959cb6fc583aff0651094ec071ba9ad43df785abc2320d86",
"sha256:7d6d9a4aee98fac3eab4952ad1d73aee87359452d1c086b5ceb43ed02ddb16b8",
"sha256:84d081fbc252d1b6a982e1870660e7330fb8f90f676f6e78b052ad4e64714bf0",
"sha256:8768715ffc41f0008abe25d808c20c3d990f42b6e2e58305d5da280ae7d1fa3b",
"sha256:920b1de295e72887bafa3ad9f7a792f811847d57ea6b1215154030cf131f16b1",
"sha256:9a08144fa4cba33db5255f9b74f0b89888622109bd2776148f2597447f92a94e",
"sha256:a26d7ff68dfdb9f87a016ecfd1e1c2bacbe3108f4e0f8bcd2228ef9a766c787d",
"sha256:aa89c3f6c277dd275d8e243ad24f3b5e701491a860d5121f2cdd399fbb31fc9c",
"sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867",
"sha256:b6c78bdf37764092d369722d9946cb65b8767bfa4110f902a1b2542d8d173c8a",
"sha256:bbb1b10aa643d973366dc2cb1ad94f99c1726a02343d43cbc011edbfac579e7c",
"sha256:c084ad935abe686bd9c898e62a02a19abfc9760b5a79bc29644463eaf2840cb0",
"sha256:c73add4bb52a206fd0c0723432db123c0c75c280cbd67174dd9d2db228ebb1b4",
"sha256:cae9c19ed12d4e8f3ebf46d1a75090e4c0dc16271c5bce1c833ac168f08fb614",
"sha256:d20b797a5c1ad80c516e41bc1fb0443ddb5006e9aaa7bda2d71978346aeb9132",
"sha256:d3d1654e11d724760cdb37a3d7691f0be9db5fbdaef59c9f532aabf87006dbaa",
"sha256:d878f2a6707cc9d53a1be1414bbb419e629c3d6e67f69230217bb663e76b5087"
],
"markers": "python_version >= '3.8'",
"version": "==2.4.0"
},
"tortoise-orm": {
"hashes": [
"sha256:deaabed1619ea8aab6213508dff025571a701b7f34ee534473d7bb7661aa9f4f",
@ -1231,11 +1385,11 @@
},
"tzdata": {
"hashes": [
"sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8",
"sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"
"sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1",
"sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"
],
"markers": "python_version >= '2'",
"version": "==2025.2"
"version": "==2025.3"
},
"ujson": {
"hashes": [

589
aaa.json Normal file
View File

@ -0,0 +1,589 @@
{
"source_type": "智联招聘",
"name": "深圳市安保医疗科技股份有限公司",
"common_name": "深圳市安保医疗科技股份有限公司",
"title": "质量体系工程师",
"title_addr": "质量体系工程师",
"description": "岗位职责1、负责公司质量管理体系的策划运营维护和合规性的管理2、负责公司各类医疗器械质量管理体系国内GMP、ISO13485、QSR820、MDSAP、 MDR、ISO9001、 职业健康和安全,环境管理体系等)的运营、维护与升级; 3、负责公司各类外部体系审核工作国内飞检、CE审核、飞检等的内部协调与组织工作 4、负责生产、经营数据上报及负责监控不良事件上报 5、负责相关行业...",
"education": "本科",
"skill": "医疗器械质量管理体系,ISO认证,GMP认证,FDA认证,二三类医械经验",
"welfare": "",
"years": "3-5年",
"salary": "1.2-1.8万",
"location": "深圳宝安",
"position": "深圳宝安",
"job_type": "全职",
"size": "500-999人",
"employer_type": "民营",
"industry": "医疗设备/器械",
"job_1st_class": "",
"job_2nd_class": "",
"job_3rd_class": "",
"job_4th_class": "",
"date": "2026-01-22 11:23:37",
"start_date": "",
"end_date": "",
"age": "",
"sex": "",
"number": "1",
"url": "http://jobs.zhaopin.com/CC219609310J40829101009.htm",
"company_id": "21960931",
"company_name": "深圳市安保医疗科技股份有限公司",
"company_url": "http://company.zhaopin.com/CC219609310.htm",
"company_desc": "<p>深圳市安保医疗科技股份有限公司简称“安保医疗”成立于2001年是国家级高新技术企业、国家级专精特新重点“小巨人”企业深耕急救与生命支持领域二十余年致力于为全球医疗系统提供一体化综合解决方案。</p><p><br></p><p>作为国内首家危急重症一体化专业制造商,公司构建了完善的研发创新体系,拥有广东省及深圳市急救和生命支持类医疗设备工程技术研究中心、高端急危重症医疗设备广东省工程研究中心、深圳市高端医疗设备中小试基地、深圳市博士后创新实践基地等多层次科研平台。全面掌握了融合按压、气道管理、机械通气、电除颤等核心技术,实现了关键技术与核心部件的自主可控,推动高端医疗装备的国产化进程。</p><p><br></p><p>目前安保医疗已上市40余款生命支持设备涵盖心肺复苏机、急救转运呼吸机、除颤仪等关键产品拥有近1000项专利技术多项集成化设备达到国际先进水平部分技术全球领先。公司牵头承担了十余项国家及省市级重大科研项目与50余家顶级医院、30余所高校建立深度协同构建了产学研用一体化的创新生态。</p><p><br></p><p>公司凭借卓越的技术实力与创新能力先后荣获2023年深圳市科技进步奖一等奖、2018年广东省科技进步奖二等奖、2023年及2024年中国专利奖优秀奖、2024年广东省及深圳市制造业单项冠军等多项荣誉成为中国高端医疗装备自主创新与国产替代的中坚力量。</p>",
"base_data": {
"abroadFlag": 2,
"abroadTipInfo": {
"abroadTips": [],
"icon": "",
"title": ""
},
"adResponse": None,
"aiPositionRecommendLevel": "",
"aiPositionRecommendReason": "",
"alreadyCallPhone": False,
"applyType": "1",
"campusBestCompany": {
"bestCompanyUrl": "",
"homepageType": 0,
"logoTagUrl": "",
"state": 0
},
"campusJobDetail": None,
"campusJobMatchData": None,
"campusPositionCardTagInfo": None,
"campusRootOrgInfo": None,
"canBeRegular": False,
"canRemoteInternship": False,
"cardCustomJson": "{"address":" ","companyName":"","locationType":"1","salary60":"1.2-1.8","strengthLabel":"A"}",
"cardType": 1,
"chatWindow": 1,
"cityDistrict": "宝安",
"cityId": "765",
"commercialLabel": [],
"commonTrack": {
"trackCommercialFeature": "",
"trackSocialSearchEmergencyFeature": False
},
"companyId": 21960931,
"companyLogo": "https://rd5-public.zhaopin.cn/imgs/company/043dbec10b9ee8b8104f537b2280e6c4.jpg",
"companyName": "深圳市安保医疗科技股份有限公司",
"companyNumber": "CZ219609310",
"companyRootId": 21960931,
"companyScaleTypeTagsNew": [],
"companySize": "500-999人",
"companyUrl": "http://company.zhaopin.com/CC219609310.htm",
"complainFlag": False,
"deliveryPath": "",
"displayPhoneNumber": False,
"distance": 0.0,
"distanceFormat": "",
"distanceText": "",
"education": "本科",
"experimentInfo": None,
"extend": None,
"extensions": None,
"featureServer": {
"jdViews3d": "38",
"lastReplyTime": 1773997359829,
"lastReplyTimeText": "",
"staffAvgFirstResponseTime7d": 1332,
"staffAvgHandleResumeTime30d": 3681,
"staffHandleResumeCnts30d": 191,
"staffReplyRate30d": 0.48,
"todayReplyNum": 0,
"todayReplyNumText": ""
},
"feedOperation": None,
"feedPosition": None,
"financingStage": {
"code": 3,
"name": "A轮"
},
"firstPublishTime": "2026-01-22 11:23:37",
"hasAppliedPosition": False,
"industryCompanyTags": [
"500030000",
"1200030000"
],
"industryName": "医疗设备/器械",
"industryTags": [
"500210000"
],
"innerBusinessInfo": {
"customIndustryList": [
{
"itemCode": 500210000,
"name": "医疗设备/器械",
"standard": True
}
]
},
"internshipMonths": 0,
"isNewPosition": 0,
"jdCardType": 2,
"jobDetailData": {
"company": {
"base": None,
"companyAuditNature": None,
"companyComment": None,
"companyInterview": None,
"jumpDetail": None,
"orgBestRanking": None,
"orgReliableCompany": None,
"other": None,
"state": None
},
"companyProxy": {
"companyAddress": "",
"companyImage": "",
"companyName": "",
"companySize": "",
"entryCompanyTitle": ""
},
"customAttributeInfo": {
"platformRemind": "",
"reportItems": [],
"welfareItems": [],
"workTimeItems": []
},
"debug": {},
"experimentInfo": {
"blueCollarJobTitleExperimentInfo": None
},
"featureServer": None,
"imSection": None,
"internship": [],
"live": {
"liveItems": [],
"liveQuickFocusChecked": 0,
"liveQuickFocusState": 0,
"recommendLiveList": [],
"state": 0
},
"operationSection": {
"topJobBannerArea": None
},
"partTime": [],
"position": {
"base": {
"deliveryPath": "",
"education": "本科",
"educationCode": "",
"maxSalary": "",
"minSalary": "",
"positionId": 40829101009,
"positionName": "质量体系工程师",
"positionNumber": "CC219609310J40829101009",
"positionUrl": "",
"positionWorkingExp": "3-5年",
"positionWorkingExpCode": "",
"propertyType": "",
"salary": "1.2-1.8万",
"salaryReal": "",
"workType": "全职"
},
"date": {
"dateEnd": "",
"dateStart": "",
"firstPublishTime": "",
"positionPublishTime": "",
"positionUpdateTime": "",
"positionUpdateTimeText": ""
},
"desc": {
"description": "岗位职责:\n1、负责公司质量管理体系的策划运营维护和合规性的管理\n2、负责公司各类医疗器械质量管理体系国内GMP、ISO13485、QSR820、MDSAP、 MDR、ISO9001、 职业健康和安全,环境管理体系等)的运营、维护与升级; \n3、负责公司各类外部体系审核工作国内飞检、CE审核、飞检等的内部协调与组织工作 \n4、负责生产、经营数据上报及负责监控不良事件上报 \n5、负责相关行业法律法规的收集、整理、受控及内部传递\n6、协助质量经理做好其他部门工作。\n岗位要求\n1.本科及以上学历大学英文4级以上的阅读能力专业不限\n2.至少三年以上二类有源或三类有源医疗器械质量体系管理经验;\n3.有医疗器械内审员资格证有GCP资格证优先\n4.熟悉MDSAP或MDR2017/745优先。",
"descriptionHighlight": "",
"highlightLabels": [],
"labels": [
"医疗器械质量管理体系",
"ISO认证",
"GMP认证",
"FDA认证",
"二三类医械经验",
"医疗设备/器械"
],
"performanceBonus": "",
"welfareLabel": [],
"welfareTags": []
},
"jobType": {
"jobType": "",
"jobTypeLevel": "15000400000000",
"jobTypeLevelName": "",
"subJobType": "",
"subJobTypeLevel": "15000400170000",
"subJobTypeLevelName": ""
},
"onlineCarHailingExtend": {
"gray": False,
"promiseGuarantee": ""
},
"onlineCarInfo": [],
"other": {
"customJobGroup": "DEFAULT",
"deliveredPreviouslyTip": "",
"jobKeyword": {
"keywords": []
},
"jobSkillTags": [],
"jobTypeIsBlueCollar": False,
"overseasList": [],
"pageStyle": 0,
"positionCommercialLabel": [],
"positionHighlight": "",
"propertyTypeUrl": "",
"rpoProxyDisplayOrgName": "",
"urgentRecruitmentUrl": ""
},
"preferredHrInfo": {
"icon": "",
"introduce": "",
"jumpUrl": "",
"preferredHr": False
},
"todayInterview": None,
"workLocation": {
"address": "工作地点:宝安区 · 石岩",
"addressType": 0,
"latitude": "22.6397107496406",
"longitude": "113.92158006824309",
"positionCityDistrict": "",
"positionCityDistrictCode": "",
"positionCityId": "765",
"positionWorkCity": "",
"showMap": True,
"showMultiAddressesTip": "",
"staticMapUrl": "https://storage-public.zhaopin.cn/job/share/1772075366278481083/6e4958065143498cad2bbfc1aef40d83",
"streetName": "",
"tradingArea": "",
"travelMode": "bus",
"verifyTheTruthUrl": "",
"workAddress": "深圳宝安创维创新谷-8号楼"
}
},
"proxyWarning": None,
"recommender": {
"avatar": "",
"name": "",
"state": 0,
"text": "",
"title": ""
},
"secure": {
"abroadLabel": "",
"abroadTipInfo": None,
"safeCenter": None,
"safetyReminder": None
},
"staff": {
"activityLevel": [
"高回复率"
],
"auditNaturePrompt": None,
"authenticationState": 0,
"avatar": "https://storage-public.zhaopin.cn/zp/clouddisk/1771900452818918625/fee00116-221c-49df-a42a-6fb6f9cfcd09.png?x-oss-process=image/resize,l_240/rotate,0",
"companyName": "深圳市安保医疗科技股份有限公司",
"goldMedalInterviewer": None,
"greeting": "",
"greetingHasDelivery": "",
"hrJob": "HR",
"hrOnlineIocState": 0,
"hrOnlineState": "",
"hrResumeOperationState": "",
"hrStateInfo": "",
"id": 1168020727,
"lastOnlineTime": 0,
"lastOnlineTimeText": "",
"modularState": 1,
"other": {
"freeTag": None,
"tagUrl": ""
},
"positionDetailStaffQuickReply": None,
"staffName": "陈女士"
},
"stateInfo": {
"deliveryAfterGuide": None,
"imSessionInfoDetail": {
"imChatStatus": 0,
"imChatStatusForChatBeforeDelivery": 0,
"imDeliveryTitle": "",
"referType": -1
},
"positionBehaviorState": {
"deliveryState": 0,
"favoriteState": 0,
"followHrState": 0,
"imReplyState": -1,
"negativeState": 0,
"sessionChatState": 0
},
"signUpStatusInfo": {},
"state": {
"abroadFlag": 2,
"applyType": "",
"callProcess": "",
"hasAppliedPosition": False,
"positionDeliveryType": "",
"positionSourceType": 1,
"workMode": "ONSITE",
"workModeDesc": ""
},
"useNewAfterDeliveryStyle": False
},
"verifyTheTruth": None,
"verifyTrueFeedback": None
},
"jobHitReason": "",
"jobHitReasonHighlights": [],
"jobId": 40829101009,
"jobKeyword": {
"keywords": [
{
"itemValue": ""
},
{
"itemValue": ""
},
{
"itemValue": ""
},
{
"itemValue": ""
},
{
"itemValue": ""
},
{
"itemValue": ""
},
{
"itemValue": ""
},
{
"itemValue": ""
},
{
"itemValue": ""
},
{
"itemValue": ""
},
{
"itemValue": ""
},
{
"itemValue": ""
}
]
},
"jobKnowledgeWelfareFeatures": [],
"jobPostingTime": 1769052217250,
"jobRootOrgInfo": {
"cityName": "深圳",
"reviewOrgNature": 1
},
"jobSkillTags": [
{
"id": 19824127,
"name": "医疗器械质量管理体系",
"standard": False
},
{
"id": 19381048,
"name": "ISO认证",
"standard": False
},
{
"id": 18958610,
"name": "GMP认证",
"standard": False
},
{
"id": 19366784,
"name": "FDA认证",
"standard": False
},
{
"id": 488082095,
"name": "二三类医械经验",
"standard": False
}
],
"jobSummary": "岗位职责1、负责公司质量管理体系的策划运营维护和合规性的管理2、负责公司各类医疗器械质量管理体系国内GMP、ISO13485、QSR820、MDSAP、 MDR、ISO9001、 职业健康和安全,环境管理体系等)的运营、维护与升级; 3、负责公司各类外部体系审核工作国内飞检、CE审核、飞检等的内部协调与组织工作 4、负责生产、经营数据上报及负责监控不良事件上报 5、负责相关行业...",
"liveCard": {
"icon": "",
"liveState": 0,
"liveTips": "",
"roomId": 0,
"startTimeFormat": "",
"videoUrl": ""
},
"matchInfo": {
"icon": "",
"matched": 0,
"tagState": 0
},
"menVipLevel": 0,
"name": "质量体系工程师",
"needMajor": [],
"number": "CC219609310J40829101009",
"orgBestEmployerFlag": 1,
"orgCommercialTags": [],
"orgPayedFlag": 1,
"payload": {
"name": "",
"partition": "",
"score": "",
"weight": ""
},
"positionCommercialLabel": [],
"positionExpandCardData": "",
"positionExpandCardType": 0,
"positionHighlight": "",
"positionOfNlp": 1,
"positionSourceType": 1,
"positionSourceTypeUrl": "",
"positionURL": "http://jobs.zhaopin.com/CC219609310J40829101009.htm",
"positionUrl": "http://jobs.zhaopin.com/CC219609310J40829101009.htm",
"property": "民营",
"propertyCode": "5",
"propertyName": "民营",
"propertyType": "",
"propertyTypeUrl": "",
"provideInternshipCertificate": False,
"proxyModel": {
"proxiedOrgName": "",
"proxiedOrgSize": "",
"recruitPosition": 0
},
"publishTime": "2026-03-05 09:51:08",
"recallSign": {
"gMethod": "config-position_search-position_mbscore-ANONYMOUS-welfare-words",
"gParam": "query-ps-mbscore-3",
"gQuery": "query-ps-mbscore-3",
"gSort": "query-ps-mbscore-3",
"gSource": "solr.source_position_query",
"gWeight": 0
},
"recruitNumber": 1,
"redirectUrl": "",
"redirectable": False,
"rootCompanyNumber": "CZ219609310",
"rpoProxied": False,
"rpoProxy": False,
"salary60": "1.2-1.8万",
"salaryCount": "",
"salaryReal": "12001-18000",
"salaryType": 1,
"searchTagList": [],
"securityAddressLabel": "",
"settlementType": "",
"showDistance": 0,
"showSkillTags": [
{
"highlightBackGroundColor": "",
"highlightWordColor": "",
"tag": "3-5年"
},
{
"highlightBackGroundColor": "",
"highlightWordColor": "",
"tag": "本科"
},
{
"highlightBackGroundColor": "",
"highlightWordColor": "",
"tag": "医疗器械质量管理体系"
},
{
"highlightBackGroundColor": "",
"highlightWordColor": "",
"tag": "ISO认证"
},
{
"highlightBackGroundColor": "",
"highlightWordColor": "",
"tag": "GMP认证"
},
{
"highlightBackGroundColor": "",
"highlightWordColor": "",
"tag": "FDA认证"
},
{
"highlightBackGroundColor": "",
"highlightWordColor": "",
"tag": "二三类医械经验"
},
{
"highlightBackGroundColor": "",
"highlightWordColor": "",
"tag": "医疗设备/器械"
}
],
"skillLabel": [
{
"state": 0,
"value": "医疗器械质量管理体系"
},
{
"state": 0,
"value": "ISO认证"
},
{
"state": 0,
"value": "GMP认证"
},
{
"state": 0,
"value": "FDA认证"
},
{
"state": 0,
"value": "二三类医械经验"
}
],
"skillLabelPersonality": "",
"staffCard": {
"authenticationState": 0,
"avatar": "https://storage-public.zhaopin.cn/zp/clouddisk/1771900452818918625/fee00116-221c-49df-a42a-6fb6f9cfcd09.png?x-oss-process=image/resize,l_240/rotate,0",
"goldMedalInterviewer": {
"goldMedalInterviewer": False,
"interviewerImageUrl": "",
"interviewerTitle": ""
},
"hrCompanyName": "",
"hrJob": "HR",
"hrOnlineIocState": 0,
"hrOnlineState": "三日内活跃",
"hrStateInfo": "高回复率",
"id": 1168020727,
"lastOnlineTime": 1774084085367,
"lastOnlineTimeText": "",
"staffName": "陈女士"
},
"streetId": 44030605,
"streetName": "石岩",
"subJobTypeLevel": "15000400170000",
"subJobTypeLevelName": "质量体系工程师",
"subways": [],
"tagABC": "",
"tagList": [],
"todayInterview": False,
"todayInterviewImageUrl": "",
"topLabel": None,
"tradingArea": "",
"volcanoMeterial": None,
"weeklyInternshipDays": 0,
"welfareLabel": [],
"welfareTagList": [],
"workCity": "深圳",
"workDateType": "",
"workMode": "",
"workType": "全职",
"workTypeCode": "2",
"workingExp": "3-5年"
}
}

2683
aaa.txt Normal file

File diff suppressed because it is too large Load Diff

197
app/CLAUDE.md Normal file
View File

@ -0,0 +1,197 @@
[根目录](../CLAUDE.md) > **app**
# app - FastAPI 后端模块
## 模块职责
提供 JobData 平台的 REST API 服务,包含:用户/角色/权限/菜单/部门管理(RBAC),招聘数据入库、查询、清洗与分析,Token 与代理 IP 管理,定时任务调度,以及审计日志记录。
---
## 入口与启动
| 文件 | 说明 |
|------|------|
| `run.py`(根目录) | `uvicorn` 启动入口,读取 `APP_HOST`/`APP_PORT`/`UVICORN_WORKERS` 环境变量 |
| `app/__init__.py` | FastAPI 应用工厂 `create_app()`,注册中间件、异常处理器、路由,以及 lifespan 钩子 |
| `app/core/init_app.py` | lifespan 内部逻辑:DB 迁移、种子数据、ClickHouse 初始化 |
| `app/core/scheduler.py` | APScheduler 启动与任务注册 |
### 启动顺序
1. Tortoise-ORM 连接 MySQL,生成 schema
2. 按环境变量执行数据库迁移(Aerich)
3. 初始化种子数据(超级管理员、菜单、API、角色)
4. 初始化 ClickHouse 表/视图(可选)
5. APScheduler 启动定时任务
6. FastAPI 开始接受请求
---
## 对外接口
API 前缀:`/api/v1`,完整路由注册见 `app/api/v1/__init__.py`
| 路由前缀 | 标签 | 权限 | 说明 |
|----------|------|------|------|
| `/base` | 基础模块 | 无 | 登录、获取用户信息、菜单树 |
| `/user` | 用户管理 | DependPermission | 用户 CRUD |
| `/role` | 角色管理 | DependPermission | 角色 CRUD、菜单/API 分配 |
| `/menu` | 菜单管理 | DependPermission | 菜单树 CRUD |
| `/api` | API 管理 | DependPermission | 接口注册与权限管理 |
| `/dept` | 部门管理 | DependPermission | 部门树 CRUD |
| `/auditlog` | 审计日志 | DependPermission | 操作日志查询 |
| `/job` & `/universal` | 数据入库/通用数据接口 | 无鉴权(内部调用) | 职位/公司数据批量入库 |
| `/token` | Token 管理 | 无鉴权 | Boss Token CRUD |
| `/proxy` | 代理 IP 管理 | DependPermission | 代理池管理 |
| `/stats` | 数据统计 | 无 | 各平台数据量统计 |
| `/pipeline` | 流水线 | 无 | 触发 ECS pipeline |
| `/keyword` | 关键词管理 | 无 | 爬虫关键词(城市+职位)管理 |
| `/cleaning` | 数据清理 | DependPermission | 定向清洗操作 |
| `/analytics` | 数据分析 | 无 | 趋势、来源分布统计 |
| `/company` | 公司搜索 | 无 | 公司信息查询 |
**认证机制**:JWT(HS256,有效期 7 天),通过 `DependPermission` 依赖注入检查路由级别权限。
---
## 关键依赖与配置
配置集中在 `app/settings/config.py`(`pydantic-settings.BaseSettings`,支持环境变量覆盖):
```python
# 关键字段(需通过环境变量覆盖)
SECRET_KEY = "CHANGE_ME_DEV_ONLY" # JWT 签名密钥
TORTOISE_ORM.connections.default # MySQL 连接串(含密码)
CLICKHOUSE_HOST / USER / PASS # ClickHouse 连接
SMTP_USER / SMTP_PASS # 邮件凭据
```
**中间件链**(从外到内):
1. `CORSMiddleware` - 跨域(默认允许 `http://localhost:5173`)
2. `BackGroundTaskMiddleware` - 后台任务支持
3. `HttpAuditLogMiddleware` - HTTP 审计日志(排除登录接口)
4. `IpTrackingMiddleware` - IP 请求追踪
---
## 数据模型
### MySQL(Tortoise-ORM)
| 表 | 模型文件 | 说明 |
|----|----------|------|
| `user` | `app/models/admin.py` | 用户(含角色多对多) |
| `role` | `app/models/admin.py` | 角色(含菜单、API 多对多) |
| `api` | `app/models/admin.py` | 接口注册表 |
| `menu` | `app/models/admin.py` | 菜单树(parent_id 自引用) |
| `dept` | `app/models/admin.py` | 部门树 + 闭包表 |
| `auditlog` | `app/models/admin.py` | HTTP 操作审计 |
| `boss_token` | `app/models/token.py` | Boss 直聘登录 Token |
| `cleaning_*` | `app/models/cleaning.py` | 数据清洗任务状态 |
| `scheduled_task_run` / `stats_total` | `app/models/metrics.py` | 定时任务运行记录与统计汇总 |
### ClickHouse(原始数据存储)
| 表/视图 | 引擎 | 说明 |
|---------|------|------|
| `boss_job` | MergeTree | Boss 职位原始 JSON,`job_id` 去重 |
| `boss_company` | MergeTree | Boss 公司原始 JSON,`company_name` 去重 |
| `qcwy_job` | MergeTree | 前程无忧职位,`job_id + update_date_time` 去重 |
| `qcwy_company` | MergeTree | 前程无忧公司 |
| `zhilian_job` | MergeTree | 智联招聘职位,`number + first_publish_time` 去重 |
| `zhilian_company` | MergeTree | 智联招聘公司 |
| `pending_company` | ReplacingMergeTree | 待处理公司队列,`(source, company_id)` 去重 |
| `job_analytics` | VIEW | 三平台统一分析视图(UNION ALL) |
ClickHouse 表结构在 `app/core/clickhouse_init.py` 中通过 `CREATE TABLE IF NOT EXISTS` 管理。
---
## 核心服务
| 服务文件 | 职责 |
|----------|------|
| `app/services/cleaning.py` | `CleaningService`:多平台定向清洗(URL/ID/公司名/公司ID),自动识别平台 |
| `app/services/company_cleaner.py` | 公司数据自动清洗(collect 待处理 → process → 入库) |
| `app/services/analytics_service.py` | `AnalyticsService`:封装 ClickHouse 分析查询 |
| `app/services/job.py` | `DataRouterService`:数据路由入库(去重逻辑) |
| `app/services/ingest_service.py` | 批量数据摄入 |
| `app/services/crawler/boss.py` | Boss 爬虫 Service 封装(HTTP 层) |
| `app/services/crawler/qcwy.py` | 前程无忧爬虫 Service |
| `app/services/crawler/zhilian.py` | 智联招聘爬虫 Service |
| `app/repositories/clickhouse_repo.py` | ClickHouse Repository(`ClickHouseBaseRepo` + `JobAnalyticsRepo`) |
| `app/core/scheduler.py` | 定时任务:stats、ip_alert、ecs_pipeline、company_cleaning、daily_cleanup |
| `app/core/locks.py` | `DistributedLock`:基于文件/Redis 的分布式锁,防多 Worker 重复执行 |
| `app/core/algorithms/antispider.py` | 反爬虫算法(签名生成等) |
---
## 测试与质量
- 当前无测试文件,属于主要缺口。
- 代码质量工具:`ruff`(lint)、`black`(格式)、`isort`(导入排序)。
- 建议优先补充的测试:
- `CleaningService.clean_target_auto()` 的平台识别逻辑
- `DataRouterService.store_data()` 的去重逻辑
- `app/api/v1/analytics.py` 接口集成测试
---
## 常见问题 (FAQ)
**Q: 启动报 ClickHouse 连接失败?**
A: 检查 `CLICKHOUSE_HOST` 环境变量,或在 `config.py` 中将 `CLICKHOUSE_HOST` 置为空字符串跳过初始化。
**Q: 多 Worker 下任务重复执行?**
A: 通过文件锁(`.startup_lock` 目录)和 `DistributedLock` 保护,若 Worker 异常退出可能导致锁残留,手动删除 `.startup_lock` 目录即可。
**Q: 新增 API 接口后权限不生效?**
A: 在路由文件中注册路由后,重启应用会触发 `api_controller.refresh_api()` 自动扫描 FastAPI 路由表并更新 `api` 表,然后在角色管理中分配权限。
---
## 相关文件清单
```
app/
├── __init__.py # 应用工厂 create_app()
├── settings/config.py # 全局配置(Settings)
├── api/v1/__init__.py # 路由聚合
├── api/v1/analytics.py # 数据分析接口
├── api/v1/cleaning/ # 数据清理接口
├── api/v1/job/ # 数据入库接口
├── api/v1/keyword/ # 关键词管理接口
├── api/v1/company/ # 公司搜索接口
├── controllers/ # 业务控制器(CRUD 封装)
├── core/
│ ├── init_app.py # lifespan 初始化
│ ├── scheduler.py # APScheduler 定时任务
│ ├── clickhouse.py # ClickHouse 连接管理
│ ├── clickhouse_init.py # ClickHouse 表/视图 DDL
│ ├── locks.py # 分布式锁
│ ├── middlewares.py # 中间件
│ └── algorithms/ # 签名/反爬虫算法
├── models/
│ ├── admin.py # User, Role, Api, Menu, Dept, AuditLog
│ ├── token.py # BossToken
│ ├── metrics.py # ScheduledTaskRun, StatsTotal
│ └── cleaning.py # 清洗任务状态
├── repositories/
│ └── clickhouse_repo.py # ClickHouse 查询仓库
├── services/
│ ├── cleaning.py # CleaningService
│ ├── company_cleaner.py # 公司自动清洗
│ ├── analytics_service.py # 数据分析 Service
│ ├── job.py # DataRouterService(数据入库路由)
│ └── crawler/ # 各平台爬虫 Service 封装
└── schemas/ # Pydantic 请求/响应 Schema
```
---
## 变更记录 (Changelog)
| 日期 | 说明 |
|------|------|
| 2026-03-20 | 初始化模块文档 |

View File

@ -14,6 +14,7 @@ from app.core.init_app import (
)
from app.core.clickhouse import clickhouse_manager
from app.core.scheduler import start_scheduler, shutdown_scheduler
from app.services.ingest.remote_push import close_http_client
try:
from app.settings.config import settings
@ -28,7 +29,8 @@ async def lifespan(app: FastAPI):
await init_data()
start_scheduler()
yield
# 清理所有数据库连接
# 清理所有连接
await close_http_client()
await Tortoise.close_connections()
await clickhouse_manager.close()
shutdown_scheduler()

View File

@ -29,8 +29,9 @@ v1_router.include_router(menus_router, prefix="/menu", dependencies=[DependPermi
v1_router.include_router(apis_router, prefix="/api", dependencies=[DependPermission])
v1_router.include_router(depts_router, prefix="/dept", dependencies=[DependPermission])
v1_router.include_router(auditlog_router, prefix="/auditlog", dependencies=[DependPermission])
v1_router.include_router(job_router, prefix="/ingest", tags=["数据入库"])
v1_router.include_router(job_router, prefix="/job", tags=["数据入库"])
v1_router.include_router(job_router, prefix="/universal", tags=["通用数据接口"])
v1_router.include_router(job_router, prefix="/universal", tags=["数据入库"])
v1_router.include_router(token_router, prefix="/token", tags=["Token管理"])
v1_router.include_router(proxy_router, prefix="/proxy", tags=["代理IP管理"])
v1_router.include_router(stats_router, prefix="/stats")

View File

@ -1,9 +1,6 @@
from typing import Optional, List
from typing import Optional
from datetime import datetime, date, timezone
try:
from zoneinfo import ZoneInfo
except ImportError:
from backports.zoneinfo import ZoneInfo
from zoneinfo import ZoneInfo
from fastapi import APIRouter, Depends, Query
from app.core.clickhouse import clickhouse_manager
@ -30,24 +27,28 @@ async def get_overview(
from_date: Optional[date] = None,
to_date: Optional[date] = None,
city: Optional[str] = None,
channel: Optional[str] = None,
service: AnalyticsService = Depends(get_analytics_service)
):
from_dt = to_utc(datetime.combine(from_date, datetime.min.time())) if from_date else None
to_dt = to_utc(datetime.combine(to_date, datetime.max.time())) if to_date else None
filters = {}
if city:
filters["city"] = city
if channel:
filters["channel"] = channel
return await service.get_job_statistics(filters=filters, from_dt=from_dt, to_dt=to_dt)
@router.get("/trend/volume", summary="获取数据量趋势")
async def get_volume_trend(
interval: str = Query("day", regex="^(day|hour|week|month)$"),
interval: str = Query("day", pattern="^(day|hour|week|month)$"),
from_date: Optional[date] = None,
to_date: Optional[date] = None,
from_datetime: Optional[datetime] = None,
to_datetime: Optional[datetime] = None,
channel: Optional[str] = None,
service: AnalyticsService = Depends(get_analytics_service)
):
# 兼容小时粒度的精确时间窗口,其它粒度按日期转换为起止时间
@ -65,7 +66,10 @@ async def get_volume_trend(
else:
to_dt = None
return await service.get_volume_trend(interval=interval, from_dt=from_dt, to_dt=to_dt)
filters = {}
if channel:
filters["channel"] = channel
return await service.get_volume_trend(interval=interval, filters=filters, from_dt=from_dt, to_dt=to_dt)
@router.get("/distribution/source", summary="获取数据来源分布")
async def get_source_distribution(
@ -73,6 +77,7 @@ async def get_source_distribution(
to_date: Optional[date] = None,
from_datetime: Optional[datetime] = None,
to_datetime: Optional[datetime] = None,
channel: Optional[str] = None,
service: AnalyticsService = Depends(get_analytics_service)
):
if from_datetime:
@ -89,4 +94,7 @@ async def get_source_distribution(
else:
to_dt = None
return await service.get_source_distribution(from_dt=from_dt, to_dt=to_dt)
filters = {}
if channel:
filters["channel"] = channel
return await service.get_source_distribution(filters=filters, from_dt=from_dt, to_dt=to_dt)

View File

@ -1,59 +1,91 @@
from datetime import datetime
import json
from typing import Optional
from fastapi import APIRouter, File, UploadFile, Form, Body, Query
from tortoise.expressions import Q
from tortoise.functions import Count, Sum
from app.models.cleaning import CleaningTask
from app.models.company import CompanyCleaningQueue
from app.schemas import Success, SuccessExtra
from app.services.cleaning import CleaningService
from app.services.company_cleaner import company_cleaner
from app.services.company_storage import company_storage, normalize_company_id
from app.controllers.cleaning import cleaning_controller
from app.schemas import Success, SuccessExtra
from app.models.cleaning import CleaningTask
from app.core.clickhouse import clickhouse_manager
from tortoise.expressions import Q
from typing import Optional
import json
router = APIRouter()
cleaning_service = CleaningService()
COMPANY_SOURCES = ("boss", "qcwy", "zhilian")
@router.get("/stats", summary="获取公司清洗统计信息")
async def get_stats():
"""获取 ClickHouse 中待处理公司的统计信息"""
client = await clickhouse_manager.get_client()
pending_sql = "SELECT count() FROM job_data.pending_company FINAL WHERE status = 'pending'"
pending_res = await client.query(pending_sql)
pending_count = pending_res.result_rows[0][0] if pending_res.result_rows else 0
today_sql = "SELECT count() FROM job_data.pending_company FINAL WHERE status = 'done' AND toDate(updated_at) = today()"
today_res = await client.query(today_sql)
today_count = today_res.result_rows[0][0] if today_res.result_rows else 0
dist_sql = """
SELECT source, status, count()
FROM job_data.pending_company FINAL
GROUP BY source, status
ORDER BY source, status
"""
dist_res = await client.query(dist_sql)
"""获取 MySQL 中待处理公司的统计信息"""
pending_count = await CompanyCleaningQueue.filter(status="pending").count()
today_count = await CompanyCleaningQueue.filter(
status="done",
updated_at__gte=datetime.now().replace(hour=0, minute=0, second=0, microsecond=0),
).count()
dist_rows = await CompanyCleaningQueue.annotate(total=Count("id")).group_by("source", "status").values(
"source",
"status",
"total",
)
stats = {
"total_pending": pending_count,
"today_processed": today_count,
"details": []
}
# Process distribution
source_stats = {}
for row in dist_res.result_rows:
source, status, count = row
source_stats = {
source: {
"pending": 0,
"processing": 0,
"done": 0,
"failed": 0,
"total": 0,
"jobs_fetched": 0,
"jobs_stored": 0,
"jobs_duplicate": 0,
"jobs_failed": 0,
}
for source in COMPANY_SOURCES
}
for row in dist_rows:
source = row["source"]
status = row["status"]
count = row["total"]
if source not in source_stats:
source_stats[source] = {"pending": 0, "done": 0, "failed": 0, "total": 0}
continue
if status in source_stats[source]:
source_stats[source][status] = count
source_stats[source]["total"] += count
stats["details"] = [
{"source": k, **v} for k, v in source_stats.items()
]
job_dist_rows = await CompanyCleaningQueue.annotate(
jobs_fetched_sum=Sum("jobs_fetched"),
jobs_stored_sum=Sum("jobs_stored"),
jobs_duplicate_sum=Sum("jobs_duplicate"),
jobs_failed_sum=Sum("jobs_failed"),
).group_by("source").values(
"source",
"jobs_fetched_sum",
"jobs_stored_sum",
"jobs_duplicate_sum",
"jobs_failed_sum",
)
for row in job_dist_rows:
source = row["source"]
if source not in source_stats:
continue
source_stats[source]["jobs_fetched"] = int(row["jobs_fetched_sum"] or 0)
source_stats[source]["jobs_stored"] = int(row["jobs_stored_sum"] or 0)
source_stats[source]["jobs_duplicate"] = int(row["jobs_duplicate_sum"] or 0)
source_stats[source]["jobs_failed"] = int(row["jobs_failed_sum"] or 0)
stats["details"] = [{"source": k, **v} for k, v in source_stats.items()]
return Success(data=stats)
@ -65,44 +97,44 @@ async def get_companies_list(
status: Optional[str] = Query(None)
):
"""分页获取待处理公司列表详情"""
client = await clickhouse_manager.get_client()
offset = (page - 1) * page_size
where_clauses = []
VALID_SOURCES = {"boss", "qcwy", "zhilian"}
VALID_STATUSES = {"pending", "processing", "done", "failed"}
queue_query = CompanyCleaningQueue.all()
if source:
where_clauses.append(f"source = '{source}'")
if source not in VALID_SOURCES:
return SuccessExtra(data=[], total=0, page=page, page_size=page_size)
queue_query = queue_query.filter(source=source)
if status:
where_clauses.append(f"status = '{status}'")
where_sql = " WHERE " + " AND ".join(where_clauses) if where_clauses else ""
# Count
count_sql = f"SELECT count() FROM job_data.pending_company FINAL {where_sql}"
count_res = await client.query(count_sql)
total = count_res.result_rows[0][0] if count_res.result_rows else 0
# Data
sql = f"""
SELECT source, company_id, company_name, status, error_msg, created_at, updated_at
FROM job_data.pending_company FINAL
{where_sql}
ORDER BY updated_at DESC
LIMIT {page_size} OFFSET {offset}
"""
res = await client.query(sql)
if status not in VALID_STATUSES:
return SuccessExtra(data=[], total=0, page=page, page_size=page_size)
queue_query = queue_query.filter(status=status)
total = await queue_query.count()
rows = await queue_query.order_by("-updated_at").offset(offset).limit(page_size)
data = []
for row in res.result_rows:
for row in rows:
data.append({
"source": row[0],
"company_id": row[1],
"company_name": row[2],
"status": row[3],
"error_msg": row[4],
"created_at": row[5].isoformat() if row[5] else None,
"updated_at": row[6].isoformat() if row[6] else None
"source": row.source,
"company_id": row.company_id,
"company_name": row.company_name,
"status": row.status,
"error_msg": row.error_msg,
"retry_count": row.retry_count,
"started_at": row.started_at.isoformat() if row.started_at else None,
"finished_at": row.finished_at.isoformat() if row.finished_at else None,
"jobs_fetched": row.jobs_fetched,
"jobs_stored": row.jobs_stored,
"jobs_duplicate": row.jobs_duplicate,
"jobs_failed": row.jobs_failed,
"jobs_error_msg": row.jobs_error_msg,
"created_at": row.created_at.isoformat() if row.created_at else None,
"updated_at": row.updated_at.isoformat() if row.updated_at else None
})
return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
@ -112,61 +144,30 @@ async def get_company_cleaning_detail(
company_id: str = Query(..., description="公司ID"),
company_name: Optional[str] = Query(None, description="公司名称"),
):
client = await clickhouse_manager.get_client()
table_map = {
"boss": "boss_company",
"qcwy": "qcwy_company",
"zhilian": "zhilian_company",
}
table = table_map.get(source)
if not table:
if source not in {"boss", "qcwy", "zhilian"}:
return Success(code=400, msg="不支持的数据源")
if source == "qcwy":
sql = f"""
SELECT json_data, company_name, created_at, updated_at
FROM job_data.{table}
WHERE JSONExtractString(json_data, 'companyId') = {{company_id:String}}
OR JSONExtractString(json_data, 'coId') = {{company_id:String}}
OR JSONExtractString(json_data, 'coinfo', 'coid') = {{company_id:String}}
OR company_name = {{company_name:String}}
ORDER BY updated_at DESC
LIMIT 1
"""
params = {
"company_id": str(company_id),
"company_name": str(company_name or ""),
}
else:
if not company_name:
return Success(code=400, msg="缺少公司名称")
sql = f"""
SELECT json_data, company_name, created_at, updated_at
FROM job_data.{table}
WHERE company_name = {{company_name:String}}
ORDER BY updated_at DESC
LIMIT 1
"""
params = {"company_name": str(company_name)}
print(f"DEBUG: Executing SQL: {sql}")
print(f"DEBUG: Params: {params}")
res = await client.query(sql, parameters=params)
if not res.result_rows:
model = company_storage.company_model(source)
normalized_id = normalize_company_id(source, company_id)
row = await model.get_or_none(source_company_id=normalized_id)
if not row and company_name:
row = await model.get_or_none(company_name=company_name)
if not row:
return Success(code=404, msg="未找到公司清洗结果")
row = res.result_rows[0]
raw_json = row[0]
try:
data = json.loads(raw_json)
except Exception:
data = {"raw": raw_json}
data = row.raw_json
if isinstance(data, str):
try:
data = json.loads(data)
except Exception:
data = {"raw": data}
return Success(
data={
"source": source,
"company_id": company_id,
"company_name": row[1],
"created_at": row[2].isoformat() if row[2] else None,
"updated_at": row[3].isoformat() if row[3] else None,
"company_id": row.source_company_id,
"company_name": row.company_name,
"created_at": row.created_at.isoformat() if row.created_at else None,
"updated_at": row.updated_at.isoformat() if row.updated_at else None,
"data": data,
}
)
@ -178,10 +179,13 @@ async def collect_pending_companies_api(
source: Optional[str] = Body(None, embed=True)
):
"""
分析招聘数据收集待处理的公司ID到 pending_company
分析招聘数据收集待处理的公司ID到 MySQL 队列
"""
await company_cleaner.collect_pending_companies(limit=limit, source=source)
return Success(msg=f"已完成数据分析,已收集待处理公司(上限 {limit} 条)")
summary = await company_cleaner.collect_pending_companies(limit=limit, source=source)
return Success(
msg=f"已完成数据分析,本次新增 {summary['total_created']} 条待处理公司",
data=summary,
)
@router.post("/run-pending", summary="手动执行待处理公司清洗")
@ -212,7 +216,7 @@ async def crawl_execute_pending(
proxy: Optional[str] = Body(None, embed=True),
max_delay_seconds: int = Body(5, embed=True),
):
await company_cleaner.collect_pending_companies(source=source)
await company_cleaner.collect_pending_companies(limit=limit, source=source)
await company_cleaner.process_pending_companies(
limit=limit,
source=source,
@ -222,6 +226,40 @@ async def crawl_execute_pending(
return Success(msg=f"已触发爬取并执行最近 {limit} 条待处理公司清洗任务")
@router.post("/update-company-status", summary="更新公司爬取状态(爬虫端调用)")
async def update_company_status(
    source: str = Body(..., embed=True),
    company_id: str = Body(..., embed=True),
    status: str = Body(..., embed=True),
    error_message: str = Body("", embed=True),
):
    """Record the final crawl result of one company in the MySQL cleaning queue.

    Called by the crawler once it has finished (or given up on) a company.
    The queue row is looked up by ``source`` and the normalized company id and
    is created on the fly when it does not exist yet.

    Args:
        source: Platform identifier; must be one of boss / qcwy / zhilian.
        company_id: Company id as reported by the crawler (normalized before lookup).
        status: Final state, either ``done`` or ``failed``.
        error_message: Failure detail; stored as an empty string when omitted.

    Returns:
        ``Success`` with code 400 for an invalid status or source, otherwise a
        success payload echoing ``source``, ``company_id`` and ``status``.
    """
    # Local name deliberately differs from the VALID_STATUSES used by the
    # list endpoint so the two sets are never confused.
    final_statuses = {"done", "failed"}
    if status not in final_statuses:
        return Success(msg=f"无效状态: {status},仅支持 {final_statuses}", code=400)

    # Reject unknown platforms up front: get_or_create below would otherwise
    # insert a queue row for any arbitrary source string.
    if source not in {"boss", "qcwy", "zhilian"}:
        return Success(code=400, msg="不支持的数据源")

    normalized_id = normalize_company_id(source, company_id)
    queue, _ = await CompanyCleaningQueue.get_or_create(
        source=source,
        company_id=normalized_id,
        defaults={
            "company_name": "",
            "status": "pending",
            "error_msg": "",
            "retry_count": 0,
            "started_at": None,
            "finished_at": None,
        },
    )

    queue.status = status
    queue.error_msg = error_message or ""
    queue.finished_at = datetime.now()
    if status == "failed":
        queue.retry_count += 1
    await queue.save()

    return Success(
        msg="状态更新成功",
        data={"source": source, "company_id": company_id, "status": status},
    )
@router.post("/process-company", summary="执行单个公司清洗任务")
async def process_single_company_api(
source: str = Body(..., embed=True),

View File

@ -1,46 +0,0 @@
from typing import Optional, List, Dict, Any
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel, Field
from clickhouse_connect.driver import AsyncClient
from app.core.clickhouse import clickhouse_manager
from app.services.ingest_service import IngestService
router = APIRouter()
class IngestSingleRequest(BaseModel):
    """Request body of the single-record ingest endpoint (``POST /data``).

    The four fields are forwarded, in this order, to
    ``IngestService.store_single``.
    """

    # Required.
    platform: str = Field(...)
    # Required.
    data_type: str = Field(...)
    # Required; the record itself as a JSON object.
    data: Dict[str, Any] = Field(...)
    # Optional, defaults to True.
    check_duplicate: bool = Field(True)
class IngestBatchRequest(BaseModel):
    """Request body of the batch ingest endpoint (``POST /batch``).

    The four fields are forwarded, in this order, to
    ``IngestService.store_batch``.
    """

    # Required.
    platform: str = Field(...)
    # Required.
    data_type: str = Field(...)
    # Required; list of records, each a JSON object.
    data_list: List[Dict[str, Any]] = Field(...)
    # Optional, defaults to True.
    check_duplicate: bool = Field(True)
async def get_service() -> IngestService:
    """FastAPI dependency: build an IngestService around the shared ClickHouse client."""
    return IngestService(await clickhouse_manager.get_client())
@router.post("/data")
async def ingest_data(req: IngestSingleRequest, service: IngestService = Depends(get_service)):
    """Store a single record through the ingest service.

    Returns:
        ``{"code": 200, "data": <service result>, "message": "ok"}``.

    Raises:
        HTTPException: status 500 with the error text when the service call
            fails; the original exception is chained as ``__cause__``.
    """
    try:
        res = await service.store_single(
            req.platform, req.data_type, req.data, req.check_duplicate
        )
    except Exception as e:
        # Chain the cause so the real traceback is not lost in server logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"code": 200, "data": res, "message": "ok"}
@router.post("/batch")
async def ingest_batch(req: IngestBatchRequest, service: IngestService = Depends(get_service)):
    """Store a list of records through the ingest service.

    Returns:
        ``{"code": 200, "data": <service result>, "message": "ok"}``.

    Raises:
        HTTPException: status 500 with the error text when the service call
            fails; the original exception is chained as ``__cause__``.
    """
    try:
        res = await service.store_batch(
            req.platform, req.data_type, req.data_list, req.check_duplicate
        )
    except Exception as e:
        # Chain the cause so the real traceback is not lost in server logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"code": 200, "data": res, "message": "ok"}

View File

@ -3,6 +3,6 @@ from fastapi import APIRouter
from .job import router
job_router = APIRouter()
job_router.include_router(router, tags=["数据上报"])
job_router.include_router(router, tags=["数据入库"])
__all__ = ["job_router"]

View File

@ -5,7 +5,12 @@ from pydantic import BaseModel, Field
from app.controllers.keyword import KeywordController
from app.core.dependency import DependPermission
from app.schemas.keyword import KeywordCreate, KeywordUpdate
from app.schemas.keyword import (
CrawlCompleteRequest,
KeywordCreate,
KeywordUpdate,
PageProgressRequest,
)
router = APIRouter(tags=["关键词接口"])
@ -34,18 +39,14 @@ async def get_available(
source: str,
limit: int = 1,
reserve: bool = True,
crawler_id: str = "",
controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
"""根据平台获取当天未使用的检索条件
参数:
source: 平台标识boss|qcwy|zhilian
limit: 返回数量上限默认1
返回:
标准字典结构包含 items/total/limit
优先级: partial(断点续爬) > failed(重试) > 全新关键词
"""
return await controller.get_available(source, limit, reserve)
return await controller.get_available(source, limit, reserve, crawler_id)
@router.post("/mark-used", summary="将检索条件标记为今日已使用")
@ -180,3 +181,32 @@ async def delete_keyword(
删除结果
"""
return await controller.delete_keyword(source, id)
@router.post("/page-progress", summary="爬虫汇报单页爬取进度")
async def report_page_progress(
    request: PageProgressRequest,
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Endpoint a crawler calls after each finished page to report progress.

    Unpacks the request body and delegates to
    ``KeywordController.report_page_progress``.
    """
    result = await controller.report_page_progress(
        source=request.source,
        keyword_id=request.keyword_id,
        page=request.page,
        total_pages=request.total_pages,
        jobs_found=request.jobs_found,
    )
    return result
@router.post("/crawl-complete", summary="爬虫汇报爬取完成或失败")
async def report_crawl_complete(
    request: CrawlCompleteRequest,
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Endpoint a crawler calls when a keyword crawl has completed or failed.

    Unpacks the request body and delegates to
    ``KeywordController.report_crawl_complete``.
    """
    result = await controller.report_crawl_complete(
        source=request.source,
        keyword_id=request.keyword_id,
        status=request.status,
        error_message=request.error_message,
    )
    return result

View File

@ -1,13 +1,12 @@
import logging
import time
from typing import Any, Dict, Tuple
from fastapi import APIRouter, Query, Body, Path, BackgroundTasks
from fastapi.background import P
from typing import Any, Dict, Optional, Tuple
from fastapi import APIRouter, Query, Body, Path
from tortoise.expressions import Q
from app.controllers.token import token_controller
from app.schemas.base import Fail, Success, SuccessExtra
from app.schemas.token import BossTokenUpdate,BossTokenCreate
from app.schemas.base import Success, SuccessExtra
from app.schemas.token import BossTokenUpdate, BossTokenCreate
logger = logging.getLogger(__name__)
@ -15,7 +14,7 @@ token_router = APIRouter()
# 简单内存缓存key 为查询参数组合value 为 (缓存时间戳, 响应数据)
_TOKENS_CACHE: Dict[Tuple[Any, Any, int, int], Tuple[float, Dict[str, Any]]] = {}
_CACHE_TTL_SECONDS: int =60
_CACHE_TTL_SECONDS: int = 60
@token_router.get("/tokens", summary="获取Boss Token列表")
async def list_boss_tokens(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
    status: int = Query(None, description="状态筛选"),
    wt2: Optional[str] = Query(None),
    mpt: Optional[str] = Query(None),
):
    """Return a page of Boss tokens, served from a short-lived in-memory cache.

    Args:
        page: 1-based page number.
        page_size: Number of rows per page.
        status: Optional exact-match filter on ``status``.
        wt2: Optional case-insensitive substring filter on ``wt2``.
        mpt: Optional case-insensitive substring filter on ``mpt``.

    Returns:
        A ``SuccessExtra`` response with ``data``, ``total``, ``page`` and
        ``page_size``. A response cached for the same parameter combination
        is reused while it is younger than ``_CACHE_TTL_SECONDS``.
    """
    # Every parameter that shapes the query must be part of the key; leaving
    # `status` out would hand one status filter's rows to another.
    cache_key: Tuple[Any, ...] = (status, wt2, mpt, page, page_size)
    now = time.monotonic()
    cached = _TOKENS_CACHE.get(cache_key)
    if cached and (now - cached[0] < _CACHE_TTL_SECONDS):
        return cached[1]

    q = Q()
    if status is not None:
        q &= Q(status=status)
    if wt2:
        q &= Q(wt2__icontains=wt2)
    if mpt:
        q &= Q(mpt__icontains=mpt)

    total, token_objs = await token_controller.get_tokens(
        page=page, page_size=page_size, search=q
    )
    data = [await obj.to_dict() for obj in token_objs]
    resp = SuccessExtra(data=data, total=total, page=page, page_size=page_size)
    _TOKENS_CACHE[cache_key] = (now, resp)
    return resp
@token_router.get("/tokens/{token_id}", summary="获取Boss Token详情")
@ -53,7 +64,6 @@ async def create_boss_token(
):
"""创建Boss Token"""
await token_controller.create_token(token_data)
# 清空缓存,确保新数据立即生效
_TOKENS_CACHE.clear()
return Success(msg="创建成功")
@ -65,7 +75,6 @@ async def update_boss_token(
):
"""更新Boss Token"""
await token_controller.update_token(token_id, token_data)
# 清空缓存,确保更新立即生效
_TOKENS_CACHE.clear()
return Success(msg="更新成功")
@ -76,7 +85,6 @@ async def delete_boss_token(
):
"""删除Boss Token"""
await token_controller.delete_token(token_id)
# 清空缓存,确保删除立即生效
_TOKENS_CACHE.clear()
return Success(msg="删除成功")
@ -84,104 +92,7 @@ async def delete_boss_token(
@token_router.post("/tokens/cache/clear", summary="强制清除Token缓存")
async def clear_token_cache():
    """Drop every cached token-list response and report how many were removed."""
    removed = len(_TOKENS_CACHE)
    # The dict is emptied in place, so the module-level name is never rebound.
    _TOKENS_CACHE.clear()
    logger.info(f"手动清除Token缓存清除了 {removed} 条缓存数据")
    return Success(msg=f"成功清除 {removed} 条Token缓存")
from typing import Optional, Dict, Any
from fastapi import APIRouter, Query, HTTPException
from tortoise.transactions import in_transaction
from app.models.token import BossToken
from app.schemas.base import Success
token_router = APIRouter()
@token_router.get("/tokens")
async def list_tokens(
    wt2: Optional[str] = Query(None),
    mpt: Optional[str] = Query(None),
    page: int = Query(1, ge=1),
    page_size: int = Query(10, ge=1, le=200),
):
    """List BossToken rows, newest id first, with an in-memory response cache.

    A response is reused for the same ``(wt2, mpt, page, page_size)``
    combination while it is younger than ``_CACHE_TTL_SECONDS``.

    Args:
        wt2: Optional case-insensitive substring filter on ``wt2``.
        mpt: Optional case-insensitive substring filter on ``mpt``.
        page: 1-based page number.
        page_size: Rows per page (1-200).

    Returns:
        Dict with ``code``, ``data`` (list of token dicts) and ``total``.
    """
    key: Tuple[Any, Any, int, int] = (wt2, mpt, page, page_size)
    started = time.monotonic()
    hit = _TOKENS_CACHE.get(key)
    if hit and (started - hit[0] < _CACHE_TTL_SECONDS):
        return hit[1]

    query = BossToken.all()
    for column, needle in (("wt2", wt2), ("mpt", mpt)):
        if needle:
            query = query.filter(**{f"{column}__icontains": needle})

    total = await query.count()
    skip = (page - 1) * page_size
    rows = await query.order_by("-id").offset(skip).limit(page_size)

    exposed = (
        "id",
        "wt2",
        "mpt",
        "is_active",
        "failed_count",
        "last_used_time",
        "created_at",
    )
    data = [{name: getattr(row, name) for name in exposed} for row in rows]

    payload: Dict[str, Any] = {"code": 200, "data": data, "total": total}
    _TOKENS_CACHE[key] = (started, payload)
    return payload
@token_router.post("/tokens")
async def create_token(payload: Dict[str, Any]):
    """Create a BossToken from a raw JSON payload and invalidate the list cache.

    Recognised payload keys: ``wt2``, ``mpt``, ``is_active`` (default True),
    ``failed_count`` (default 0) and ``last_used_time``.

    Returns:
        ``Success`` carrying the new row's ``id``.

    Raises:
        HTTPException: status 400 with the error text when creation fails;
            the original exception is chained as ``__cause__``.
    """
    try:
        async with in_transaction():
            item = await BossToken.create(
                wt2=payload.get("wt2"),
                mpt=payload.get("mpt"),
                is_active=bool(payload.get("is_active", True)),
                failed_count=int(payload.get("failed_count", 0)),
                last_used_time=payload.get("last_used_time"),
            )
        # Cached list responses are stale as soon as a row is added.
        _TOKENS_CACHE.clear()
        return Success(data={"id": item.id})
    except Exception as e:
        # Chain the cause so the real traceback is not lost in server logs.
        raise HTTPException(status_code=400, detail=str(e)) from e
@token_router.put("/tokens/{id}")
async def update_token(id: int, payload: Dict[str, Any]):
    """Apply the editable fields present in ``payload`` to one BossToken.

    Only ``wt2``, ``mpt``, ``is_active``, ``failed_count`` and
    ``last_used_time`` are copied; other keys are ignored. The list cache is
    cleared after saving.

    Raises:
        HTTPException: status 404 when no token has the given id.
    """
    token = await BossToken.get_or_none(id=id)
    if token is None:
        raise HTTPException(status_code=404, detail="Token not found")

    editable = ("wt2", "mpt", "is_active", "failed_count", "last_used_time")
    for name in editable:
        if name in payload:
            setattr(token, name, payload[name])
    await token.save()

    _TOKENS_CACHE.clear()
    return Success(data={"id": token.id})
@token_router.delete("/tokens/{token_id}")
async def delete_token(token_id: int):
    """Delete one BossToken by id and clear the list cache.

    Raises:
        HTTPException: status 404 when no token has the given id.
    """
    token = await BossToken.get_or_none(id=token_id)
    if token is None:
        raise HTTPException(status_code=404, detail="Token not found")

    await token.delete()
    _TOKENS_CACHE.clear()
    return Success(data={"id": token_id})

View File

@ -1,19 +1,44 @@
"""
公司搜索控制器 使用新 crawler service 替代已删除的 company_spider
"""
import asyncio
from typing import Any, Dict, List, Optional
from company_spider.qcwy_company.spider import search_company as qcwy_search_company
from company_spider.zhilianzhaopin_company.spider import crawl_companies
from loguru import logger
from app.services.crawler.qcwy import QcwyService
from app.services.crawler.zhilian import ZhilianService
class CompanyController:
    """Company search backed by the crawler services.

    Both searches run the blocking service call in a worker thread and turn
    any failure into an empty result after logging it.
    """

    def __init__(self):
        self._qcwy = QcwyService()
        self._zhilian = ZhilianService()

    async def search_qcwy_company(self, keyword: str) -> Optional[Dict[str, Any]]:
        """Look up a company on qcwy; return ``None`` when the lookup fails."""
        try:
            return await asyncio.to_thread(self._qcwy.get_company_info, keyword)
        except Exception as e:
            logger.error(f"Qcwy company search failed: {e}")
            return None

    async def search_zhilian_company(
        self, keyword: str, city: Optional[str] = None
    ) -> List[Dict[str, Any]]:
        """Search zhilian by company name and return the ``data.list`` entries.

        Returns an empty list when the call fails or the response is not the
        expected ``{"data": {"list": [...]}}`` shape.

        NOTE(review): ``city`` is accepted for interface compatibility but is
        not passed to the service call — confirm whether the service can
        filter by city.
        """
        try:
            result = await asyncio.to_thread(
                self._zhilian.search_company_jobs_by_name, keyword
            )
            if not (result and isinstance(result, dict)):
                return []
            data = result.get("data", {})
            if not isinstance(data, dict):
                return []
            return data.get("list", [])
        except Exception as e:
            logger.error(f"Zhilian company search failed: {e}")
            return []
def create_company_controller() -> CompanyController:

View File

@ -1,224 +0,0 @@
from typing import Dict, Any, List, Optional
from fastapi import HTTPException, BackgroundTasks
from app.services.job import DataRouterService, DataType, PlatformType
from app.log import logger
from pydantic import BaseModel, Field
class UniversalDataRequest(BaseModel):
    """Request model for storing a single record of any platform."""

    # The record to store.
    data: Dict[str, Any] = Field(..., description="要存储的数据")
    # Kind of record (job / company).
    data_type: DataType = Field(..., description="数据类型 (job/company)")
    # Source platform (boss / qcwy / zhilian).
    platform: PlatformType = Field(..., description="平台类型 (boss/qcwy/zhilian)")
    # Whether to check for duplicates; defaults to True.
    check_duplicate: bool = Field(True, description="是否检查重复数据")
class BatchDataRequest(BaseModel):
    """Request model for storing a list of records of any platform."""

    # The records to store.
    data_list: List[Dict[str, Any]] = Field(..., description="要存储的数据列表")
    # Kind of record (job / company).
    data_type: DataType = Field(..., description="数据类型 (job/company)")
    # Source platform (boss / qcwy / zhilian).
    platform: PlatformType = Field(..., description="平台类型 (boss/qcwy/zhilian)")
    # Whether to check for duplicates; defaults to True.
    check_duplicate: bool = Field(True, description="是否检查重复数据")
class UniversalDataController:
    """Universal data controller: handles storage requests for every platform.

    All work is delegated to the injected ``DataRouterService``. The
    synchronous variants translate failures into HTTP 500 errors; the
    ``*_async`` variants enqueue a FastAPI background task and answer
    immediately with code 202.
    """

    # Display names used in log lines, keyed by the platform enum's value.
    _PLATFORM_LABELS = {"boss": "Boss直聘", "qcwy": "前程无忧", "zhilian": "智联招聘"}

    def __init__(self, data_router_service: DataRouterService):
        self.data_router_service = data_router_service

    @classmethod
    def _platform_label(cls, platform: PlatformType) -> str:
        """Return the display name of a platform, falling back to its raw value."""
        return cls._PLATFORM_LABELS.get(platform.value, platform.value)

    async def store_single_data(self, request: UniversalDataRequest) -> Dict[str, Any]:
        """Store one record; code is 200 on success and 400 when the service reports failure.

        Raises:
            HTTPException: status 500 when the service call itself raises.
        """
        try:
            result = await self.data_router_service.store_data(
                data=request.data,
                data_type=request.data_type,
                platform=request.platform,
                check_duplicate=request.check_duplicate
            )

            return {
                "code": 200 if result["success"] else 400,
                "message": result["message"],
                "data": result,
                "platform": request.platform,
                "data_type": request.data_type
            }

        except Exception as e:
            logger.error(f"存储单条数据失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"数据存储失败: {str(e)}") from e

    async def store_batch_data(self, request: BatchDataRequest) -> Dict[str, Any]:
        """Store a list of records and summarise the success/failed/duplicate counts.

        Raises:
            HTTPException: status 500 when the service call itself raises.
        """
        try:
            result = await self.data_router_service.batch_store_data(
                data_list=request.data_list,
                data_type=request.data_type,
                platform=request.platform,
                check_duplicate=request.check_duplicate
            )

            return {
                "code": 200,
                "message": f"批量处理完成: 成功 {result['success']} 条,失败 {result['failed']} 条,重复 {result['duplicate']}",
                "data": result,
                "platform": request.platform,
                "data_type": request.data_type
            }

        except Exception as e:
            logger.error(f"批量存储数据失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"批量数据存储失败: {str(e)}") from e

    async def store_single_data_async(self,
                                      background_tasks: BackgroundTasks,
                                      request: UniversalDataRequest) -> Dict[str, Any]:
        """Queue one record for background storage and answer with code 202.

        Raises:
            HTTPException: status 500 when the task cannot be queued.
        """
        try:
            # The actual write happens after the response has been sent.
            background_tasks.add_task(
                self._async_store_single_data,
                request
            )

            return {
                "code": 202,
                "message": "数据已加入异步处理队列",
                "platform": request.platform,
                "data_type": request.data_type
            }

        except Exception as e:
            logger.error(f"异步存储单条数据失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"异步数据存储失败: {str(e)}") from e

    async def store_batch_data_async(self,
                                     background_tasks: BackgroundTasks,
                                     request: BatchDataRequest) -> Dict[str, Any]:
        """Queue a list of records for background storage and answer with code 202.

        Raises:
            HTTPException: status 500 when the task cannot be queued.
        """
        try:
            platform_name = self._platform_label(request.platform)
            logger.info(f"📥 收到批量请求: [{platform_name}] {request.data_type.value} x{len(request.data_list)}")

            # The actual write happens after the response has been sent.
            background_tasks.add_task(
                self._async_store_batch_data,
                request
            )

            return {
                "code": 202,
                "message": f"批量数据已加入异步处理队列,共 {len(request.data_list)}",
                "platform": request.platform,
                "data_type": request.data_type
            }

        except Exception as e:
            logger.error(f"异步批量存储数据失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"异步批量数据存储失败: {str(e)}") from e

    async def _async_store_single_data(self, request: UniversalDataRequest):
        """Background task: store one record and log the outcome (never raises)."""
        try:
            result = await self.data_router_service.store_data(
                data=request.data,
                data_type=request.data_type,
                platform=request.platform,
                check_duplicate=request.check_duplicate
            )

            if result["success"]:
                logger.info(f"异步存储 {request.platform} {request.data_type} 数据成功")
            else:
                logger.warning(f"异步存储 {request.platform} {request.data_type} 数据失败: {result['message']}")

        except Exception as e:
            # Nobody is waiting on a background task, so log instead of raising.
            logger.error(f"异步存储单条数据后台任务失败: {str(e)}")

    async def _async_store_batch_data(self, request: BatchDataRequest):
        """Background task: store a list of records and log the counts (never raises)."""
        try:
            platform_name = self._platform_label(request.platform)
            result = await self.data_router_service.batch_store_data(
                data_list=request.data_list,
                data_type=request.data_type,
                platform=request.platform,
                check_duplicate=request.check_duplicate
            )

            logger.info(f"✅ 批量处理完成: [{platform_name}] 成功 {result['success']} 条, 重复 {result['duplicate']} 条, 失败 {result['failed']}")

        except Exception as e:
            # Nobody is waiting on a background task, so log instead of raising.
            logger.error(f"异步批量存储数据后台任务失败: {str(e)}")

    async def query_data(self, platform: PlatformType, data_type: DataType,
                         page: int = 1, page_size: int = 20) -> Dict[str, Any]:
        """Return one page of stored records for a platform and data type.

        Raises:
            HTTPException: status 500 when the query fails.
        """
        try:
            logger.info(f"查询 {platform} {data_type} 数据,页码: {page}, 页大小: {page_size}")

            offset = (page - 1) * page_size
            result = await self.data_router_service.query_json_data(
                platform=platform,
                data_type=data_type,
                limit=page_size,
                offset=offset
            )

            return {
                "code": 200,
                "message": "查询数据成功",
                "data": {
                    "items": result.get("data", []),
                    "total": result.get("count", 0),
                    "page": page,
                    "page_size": page_size
                },
                "platform": platform,
                "data_type": data_type
            }

        except Exception as e:
            logger.error(f"查询数据失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"查询数据失败: {str(e)}") from e

    async def get_supported_platforms(self) -> Dict[str, Any]:
        """List the supported platforms, data types and per-platform duplicate keys."""
        return {
            "code": 200,
            "message": "获取支持的平台和数据类型成功",
            "data": {
                "platforms": [platform.value for platform in PlatformType],
                "data_types": [data_type.value for data_type in DataType],
                "platform_duplicate_keys": {
                    "boss": {
                        "job": "job_id",
                        "company": "company_name"
                    },
                    "qcwy": {
                        "job": "job_id + update_date_time",
                        "company": "company_name"
                    },
                    "zhilian": {
                        "job": "number + first_publish_time",
                        "company": "company_name"
                    }
                }
            }
        }
# Factory function that creates a controller instance
def create_universal_data_controller(data_router_service: DataRouterService) -> UniversalDataController:
    """Return a UniversalDataController wired to the given router service."""
    return UniversalDataController(data_router_service)

View File

@ -4,7 +4,6 @@ from typing import Any, Dict, List, Type
from tortoise.expressions import Q
from app.core.crud import CRUDBase
from app.models.keyword import BossKeyword, QcwyKeyword, ZhilianKeyword
@ -16,103 +15,177 @@ class KeywordController:
"zhilian": ZhilianKeyword,
}
async def get_available(
    self, source: str, limit: int = 1, reserve: bool = True, crawler_id: str = ""
) -> Dict[str, Any]:
    """Return the keywords a crawler should work on next, optionally reserving them.

    Tiers are tried in priority order until ``limit`` items are collected:

    1. ``partial`` rows requested today (resume an interrupted crawl)
    2. ``failed`` rows requested today with ``retry_count < 3`` (retry)
    3. rows not requested today (fresh keywords)

    Args:
        source: Platform identifier: boss | qcwy | zhilian.
        limit: Maximum number of items wanted. At least one row is taken from
            the first non-empty tier even when ``limit`` is below 1.
        reserve: When true, picked rows are switched to ``crawling`` and
            stamped with ``crawler_id``; fresh rows also get their progress
            counters reset. When false, rows are only read.
        crawler_id: Identifier stored on reserved rows.

    Returns:
        Dict with ``code``, ``message`` and ``data`` (``items``, ``total``,
        ``limit``); ``total`` counts the rows matching any tier after this call.
    """
    import logging

    model = self._ensure_model(source)
    today = date.today()
    now = datetime.now()

    # Priority 1: resume interrupted crawls (partial)
    partial_q = Q(crawl_status="partial", last_requested_date=today)
    # Priority 2: retry failures (failed, retry < 3)
    failed_q = Q(crawl_status="failed", last_requested_date=today, retry_count__lt=3)
    # Priority 3: fresh keywords
    fresh_q = Q(last_requested_date__not=today) | Q(last_requested_date=None)

    items: List[Dict[str, Any]] = []

    for tier, query, is_fresh in (
        ("partial", partial_q, False),
        ("failed", failed_q, False),
        ("fresh", fresh_q, True),
    ):
        # Stop as soon as the request is satisfied, so a later tier can never
        # push the result past `limit`.
        if items and len(items) >= limit:
            break

        count = await model.filter(query).count()
        if count == 0:
            continue

        take = max(1, min(limit - len(items), count))

        try:
            # Random window so concurrent crawlers tend to pick different rows.
            offset = random.randint(0, max(0, count - take))
            candidates = await model.filter(query).offset(offset).limit(take).only("id")
            candidate_ids = [r.id for r in candidates]

            if not candidate_ids:
                continue

            if reserve:
                update_fields = {
                    "last_requested_at": now,
                    "crawl_status": "crawling",
                    "crawl_started_at": now,
                    "crawler_id": crawler_id,
                }
                if is_fresh:
                    # A fresh keyword starts from scratch for today.
                    update_fields.update(
                        last_requested_date=today,
                        last_completed_page=0,
                        total_pages=0,
                        jobs_found=0,
                        error_message="",
                        retry_count=0,
                    )
                # NOTE(review): select-then-update is not atomic; two crawlers
                # asking at the same moment may be handed the same row.
                await model.filter(id__in=candidate_ids).update(**update_fields)

            records = await model.filter(id__in=candidate_ids).limit(take)
            items.extend(
                {
                    "id": r.id,
                    "city": r.city,
                    "job": r.job,
                    "last_completed_page": r.last_completed_page,
                    "crawl_status": r.crawl_status,
                }
                for r in records
            )
        except Exception:
            # Best effort: a failing tier must not block the next one, but the
            # failure has to be visible in the logs.
            logging.getLogger(__name__).exception(
                "get_available: tier %s failed for source %s", tier, source
            )
            continue

    total_available = await model.filter(
        partial_q | failed_q | fresh_q
    ).count()

    return {
        "code": 200,
        "message": "查询可用检索条件成功",
        "data": {
            "items": items,
            "total": total_available,
            "limit": limit,
        },
    }
async def report_page_progress(
    self,
    source: str,
    keyword_id: int,
    page: int,
    total_pages: int = 0,
    jobs_found: int = 0,
) -> Dict[str, Any]:
    """Persist the progress a crawler reports after finishing one page.

    ``last_completed_page`` is set to ``page``, ``total_pages`` is only
    overwritten when a positive value is given, and ``jobs_found`` is added
    to the stored counter.

    Returns:
        A 404 dict when the keyword does not exist, otherwise a 200 dict
        echoing the stored progress values.
    """
    keyword = await self._ensure_model(source).filter(id=keyword_id).first()
    if not keyword:
        return {"code": 404, "message": "关键词不存在"}

    previous_jobs = keyword.jobs_found or 0
    keyword.last_completed_page = page
    if total_pages > 0:
        keyword.total_pages = total_pages
    keyword.jobs_found = previous_jobs + jobs_found

    changed = ["last_completed_page", "total_pages", "jobs_found"]
    await keyword.save(update_fields=changed)

    progress = {
        "keyword_id": keyword_id,
        "last_completed_page": keyword.last_completed_page,
        "total_pages": keyword.total_pages,
        "jobs_found": keyword.jobs_found,
    }
    return {"code": 200, "message": "进度更新成功", "data": progress}
async def report_crawl_complete(
    self,
    source: str,
    keyword_id: int,
    status: str,
    error_message: str = "",
) -> Dict[str, Any]:
    """Persist the final outcome a crawler reports for one keyword.

    Stores ``status`` and ``error_message``; a ``failed`` outcome also
    increments ``retry_count``.

    Returns:
        A 404 dict when the keyword does not exist, a 400 dict when
        ``status`` is neither ``completed`` nor ``failed``, otherwise a 200
        dict with the stored status and retry count.
    """
    keyword = await self._ensure_model(source).filter(id=keyword_id).first()
    if not keyword:
        return {"code": 404, "message": "关键词不存在"}

    if status not in ("completed", "failed"):
        return {"code": 400, "message": "status 仅支持 completed/failed"}

    keyword.crawl_status = status
    keyword.error_message = error_message
    failed = status == "failed"
    if failed:
        keyword.retry_count = (keyword.retry_count or 0) + 1

    changed = ["crawl_status", "error_message"] + (["retry_count"] if failed else [])
    await keyword.save(update_fields=changed)

    outcome = {
        "keyword_id": keyword_id,
        "crawl_status": keyword.crawl_status,
        "retry_count": keyword.retry_count,
    }
    return {"code": 200, "message": f"爬取状态已更新为 {status}", "data": outcome}
async def get_stats(self, source: str, on_date: date | None = None) -> Dict[str, Any]:
"""统计指定平台在某日期的使用与未使用数量
参数:
source: 平台标识取值为 boss|qcwy|zhilian
on_date: 统计日期不传则为今天
返回:
包含 total/used/unused 的字典结构
"""
"""统计指定平台关键词使用和爬取状态"""
model = self._ensure_model(source)
d = on_date or date.today()
total = await model.all().count()
used = await model.filter(last_requested_date=d).count()
unused = max(0, total - used)
crawling = await model.filter(crawl_status="crawling", last_requested_date=d).count()
completed = await model.filter(crawl_status="completed", last_requested_date=d).count()
failed = await model.filter(crawl_status="failed", last_requested_date=d).count()
partial = await model.filter(crawl_status="partial", last_requested_date=d).count()
return {
"code": 200,
"message": "统计成功",
@ -121,19 +194,17 @@ class KeywordController:
"total": total,
"used": used,
"unused": unused,
"crawl_status": {
"crawling": crawling,
"completed": completed,
"failed": failed,
"partial": partial,
},
},
}
async def mark_used(self, source: str, ids: List[int]) -> Dict[str, Any]:
"""将检索条件标记为今日已使用
参数:
source: 平台标识取值为 boss|qcwy|zhilian
ids: 需要标记的记录主键ID列表
返回:
更新结果包括成功条数与日期
"""
"""将检索条件标记为今日已使用"""
model = self._ensure_model(source)
updated = 0
now = datetime.now()
@ -166,18 +237,7 @@ class KeywordController:
city: str | None = None,
job: str | None = None,
) -> Dict[str, Any]:
"""获取关键词列表
参数:
source: 平台标识
page: 页码
page_size: 每页数量
city: 城市过滤
job: 职位过滤
返回:
包含列表数据和分页信息的字典
"""
"""获取关键词列表"""
model = self._ensure_model(source)
queryset = model.all()
if city:
@ -193,6 +253,11 @@ class KeywordController:
"job",
"last_requested_date",
"last_requested_at",
"crawl_status",
"last_completed_page",
"total_pages",
"jobs_found",
"retry_count",
"created_at",
"updated_at",
)
@ -207,17 +272,8 @@ class KeywordController:
}
async def create_keyword(self, source: str, obj_in: Any) -> Dict[str, Any]:
"""创建关键词
参数:
source: 平台标识
obj_in: 创建数据对象
返回:
创建结果
"""
"""创建关键词"""
model = self._ensure_model(source)
# Check if already exists
exists = await model.filter(city=obj_in.city, job=obj_in.job).exists()
if exists:
return {"code": 400, "message": "该关键词组合已存在"}
@ -235,16 +291,7 @@ class KeywordController:
return {"code": 200, "message": "创建成功", "data": data}
async def update_keyword(self, source: str, id: int, obj_in: Any) -> Dict[str, Any]:
"""更新关键词
参数:
source: 平台标识
id: 记录ID
obj_in: 更新数据对象
返回:
更新结果
"""
"""更新关键词"""
model = self._ensure_model(source)
obj = await model.filter(id=id).first()
if not obj:
@ -252,7 +299,6 @@ class KeywordController:
update_data = obj_in.model_dump(exclude_unset=True)
if update_data:
# Check for duplicates if updating city or job
if "city" in update_data or "job" in update_data:
city = update_data.get("city", obj.city)
job = update_data.get("job", obj.job)
@ -275,15 +321,7 @@ class KeywordController:
return {"code": 200, "message": "更新成功", "data": data}
async def delete_keyword(self, source: str, id: int) -> Dict[str, Any]:
"""删除关键词
参数:
source: 平台标识
id: 记录ID
返回:
删除结果
"""
"""删除关键词"""
model = self._ensure_model(source)
obj = await model.filter(id=id).first()
if not obj:
@ -296,20 +334,26 @@ class KeywordController:
}
async def get_overview_stats(self) -> Dict[str, Any]:
"""获取所有平台的统计概览
返回:
包含各平台统计数据的字典
"""
"""获取所有平台的统计概览"""
today = date.today()
stats = {}
for source, model in self._model_map.items():
total = await model.all().count()
used = await model.filter(last_requested_date=today).count()
crawling = await model.filter(crawl_status="crawling", last_requested_date=today).count()
completed = await model.filter(crawl_status="completed", last_requested_date=today).count()
failed = await model.filter(crawl_status="failed", last_requested_date=today).count()
partial_count = await model.filter(crawl_status="partial", last_requested_date=today).count()
stats[source] = {
"total": total,
"used": used,
"unused": max(0, total - used),
"crawl_status": {
"crawling": crawling,
"completed": completed,
"failed": failed,
"partial": partial_count,
},
}
return {
"code": 200,
@ -318,14 +362,7 @@ class KeywordController:
}
def _ensure_model(self, source: str) -> Type:
"""根据平台标识返回对应模型类型
参数:
source: 平台标识取值为 boss|qcwy|zhilian
返回:
对应的 Tortoise ORM 模型类型
"""
"""根据平台标识返回对应模型类型"""
model = self._model_map.get(source)
if not model:
raise ValueError("不支持的平台标识")

View File

@ -1,5 +1,4 @@
import time
import os
from typing import Dict, Any, Optional, List, Tuple
import random

View File

@ -48,9 +48,11 @@ class ClickHouseManager:
async def close(self):
    """Close the ClickHouse connection, if one is open, and forget the client.

    The client's ``close()`` is called exactly once. Its result is awaited
    only when it is awaitable, so both synchronous and asynchronous client
    implementations are supported.
    """
    import inspect

    if self._client:
        close_result = self._client.close()
        # A synchronous close() returns a plain value (usually None);
        # awaiting that would raise TypeError.
        if inspect.isawaitable(close_result):
            await close_result
        self._client = None
# 全局ClickHouse管理器实例
clickhouse_manager = ClickHouseManager()
clickhouse_manager = ClickHouseManager()

View File

@ -4,235 +4,198 @@ from app.log import logger
class ClickHouseInitializer:
"""ClickHouse数据库初始化器"""
# 6张数据表的 DDL 定义(含 channel 列)
_TABLE_DDLS = {
"boss_job": """
CREATE TABLE IF NOT EXISTS job_data.boss_job (
id UInt64 DEFAULT 0,
json_data String DEFAULT '',
job_id String DEFAULT '',
channel String DEFAULT 'mini',
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
""",
"boss_company": """
CREATE TABLE IF NOT EXISTS job_data.boss_company (
id UInt64 DEFAULT 0,
json_data String DEFAULT '',
company_name String DEFAULT '',
channel String DEFAULT 'mini',
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
""",
"qcwy_job": """
CREATE TABLE IF NOT EXISTS job_data.qcwy_job (
id UInt64 DEFAULT 0,
json_data String DEFAULT '',
job_id String DEFAULT '',
update_date_time String DEFAULT '',
channel String DEFAULT 'mini',
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
""",
"qcwy_company": """
CREATE TABLE IF NOT EXISTS job_data.qcwy_company (
id UInt64 DEFAULT 0,
json_data String DEFAULT '',
company_name String DEFAULT '',
channel String DEFAULT 'mini',
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
""",
"zhilian_job": """
CREATE TABLE IF NOT EXISTS job_data.zhilian_job (
id UInt64 DEFAULT 0,
json_data String DEFAULT '',
number String DEFAULT '',
first_publish_time String DEFAULT '',
channel String DEFAULT 'mini',
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
""",
"zhilian_company": """
CREATE TABLE IF NOT EXISTS job_data.zhilian_company (
id UInt64 DEFAULT 0,
json_data String DEFAULT '',
company_name String DEFAULT '',
channel String DEFAULT 'mini',
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
""",
}
_PENDING_COMPANY_DDL = """
CREATE TABLE IF NOT EXISTS job_data.pending_company (
source String,
company_id String,
company_name String DEFAULT '',
status String DEFAULT 'pending',
error_msg String DEFAULT '',
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now(),
version UInt64 DEFAULT 1
) ENGINE = ReplacingMergeTree(version)
ORDER BY (source, company_id)
SETTINGS index_granularity = 8192;
"""
_JOB_ANALYTICS_VIEW = """
CREATE OR REPLACE VIEW job_data.job_analytics AS
SELECT
'boss' as source,
job_id,
channel,
JSONExtractString(json_data, 'jobName') as position_name,
JSONExtractString(json_data, 'brandName') as company_name,
JSONExtractString(json_data, 'salaryDesc') as salary_text,
0.0 as salary_min,
0.0 as salary_max,
JSONExtractString(json_data, 'cityName') as city,
JSONExtractString(json_data, 'experienceName') as experience_required,
JSONExtractString(json_data, 'degreeName') as education,
created_at
FROM job_data.boss_job
UNION ALL
SELECT
'qcwy' as source,
job_id,
channel,
JSONExtractString(json_data, 'jobName') as position_name,
JSONExtractString(json_data, 'companyName') as company_name,
JSONExtractString(json_data, 'provideSalaryString') as salary_text,
0.0, 0.0,
JSONExtractString(json_data, 'workCity') as city,
JSONExtractString(json_data, 'workYear') as experience_required,
JSONExtractString(json_data, 'degree') as education,
created_at
FROM job_data.qcwy_job
UNION ALL
SELECT
'zhilian' as source,
number as job_id,
channel,
JSONExtractString(json_data, 'jobName') as position_name,
JSONExtractString(json_data, 'companyName') as company_name,
JSONExtractString(json_data, 'salary60') as salary_text,
0.0, 0.0,
JSONExtractString(json_data, 'workCity') as city,
JSONExtractString(json_data, 'workingExp') as experience_required,
JSONExtractString(json_data, 'education') as education,
created_at
FROM job_data.zhilian_job
"""
# 需要添加 channel 列的表
_CHANNEL_MIGRATION_TABLES = [
"boss_job", "boss_company",
"qcwy_job", "qcwy_company",
"zhilian_job", "zhilian_company",
]
def __init__(self, client: AsyncClient):
self.client = client
async def create_boss_job_json_table(self):
"""创建BOSS招聘职位JSON存储表"""
create_table_sql = """
CREATE TABLE IF NOT EXISTS job_data.boss_job (
id UInt64 DEFAULT 0,
json_data String DEFAULT '', -- 原始JSON数据
job_id String DEFAULT '', -- BOSS平台去重字段jobBaseInfoVO.jobId
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
"""
async def _create_table(self, name: str, ddl: str) -> None:
try:
await self.client.command(create_table_sql)
logger.info("BOSS职位JSON数据表 boss_job 创建成功")
await self.client.command(ddl)
logger.info(f"{name} 创建成功")
except Exception as e:
logger.error(f"创建BOSS职位JSON数据表失败: {e}")
raise
async def create_boss_company_json_table(self):
"""创建BOSS招聘公司JSON存储表"""
create_table_sql = """
CREATE TABLE IF NOT EXISTS job_data.boss_company (
id UInt64 DEFAULT 0,
json_data String DEFAULT '', -- 原始JSON数据
company_name String DEFAULT '', -- 公司名称去重字段
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
"""
try:
await self.client.command(create_table_sql)
logger.info("BOSS公司JSON数据表 boss_company 创建成功")
except Exception as e:
logger.error(f"创建BOSS公司JSON数据表失败: {e}")
raise
async def create_qcwy_job_json_table(self):
"""创建前程无忧职位JSON存储表"""
create_table_sql = """
CREATE TABLE IF NOT EXISTS job_data.qcwy_job (
id UInt64 DEFAULT 0,
json_data String DEFAULT '', -- 原始JSON数据
job_id String DEFAULT '', -- QCWY平台去重字段jobId
update_date_time String DEFAULT '', -- QCWY平台去重字段updateDateTime
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
"""
try:
await self.client.command(create_table_sql)
logger.info("前程无忧职位JSON数据表 qcwy_job 创建成功")
except Exception as e:
logger.error(f"创建前程无忧职位JSON数据表失败: {e}")
raise
async def create_qcwy_company_json_table(self):
"""创建前程无忧公司JSON存储表"""
create_table_sql = """
CREATE TABLE IF NOT EXISTS job_data.qcwy_company (
id UInt64 DEFAULT 0,
json_data String DEFAULT '', -- 原始JSON数据
company_name String DEFAULT '', -- 公司名称去重字段
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
"""
try:
await self.client.command(create_table_sql)
logger.info("前程无忧公司JSON数据表 qcwy_company 创建成功")
except Exception as e:
logger.error(f"创建前程无忧公司JSON数据表失败: {e}")
raise
async def create_zhilian_job_json_table(self):
"""创建智联招聘职位JSON存储表"""
create_table_sql = """
CREATE TABLE IF NOT EXISTS job_data.zhilian_job (
id UInt64 DEFAULT 0,
json_data String DEFAULT '', -- 原始JSON数据
number String DEFAULT '', -- 智联平台去重字段number
first_publish_time String DEFAULT '', -- 智联平台去重字段firstPublishTime
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
"""
try:
await self.client.command(create_table_sql)
logger.info("智联招聘职位JSON数据表 zhilian_job 创建成功")
except Exception as e:
logger.error(f"创建智联招聘职位JSON数据表失败: {e}")
raise
async def create_zhilian_company_json_table(self):
"""创建智联招聘公司JSON存储表"""
create_table_sql = """
CREATE TABLE IF NOT EXISTS job_data.zhilian_company (
id UInt64 DEFAULT 0,
json_data String DEFAULT '', -- 原始JSON数据
company_name String DEFAULT '', -- 公司名称去重字段
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
"""
try:
await self.client.command(create_table_sql)
logger.info("智联招聘公司JSON数据表 zhilian_company 创建成功")
except Exception as e:
logger.error(f"创建智联招聘公司JSON数据表失败: {e}")
logger.error(f"创建表 {name} 失败: {e}")
raise
async def create_pending_company_table(self):
"""创建待处理公司表"""
create_table_sql = """
CREATE TABLE IF NOT EXISTS job_data.pending_company (
source String,
company_id String,
company_name String DEFAULT '',
status String DEFAULT 'pending',
error_msg String DEFAULT '',
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now(),
version UInt64 DEFAULT 1
) ENGINE = ReplacingMergeTree(version)
ORDER BY (source, company_id)
SETTINGS index_granularity = 8192;
"""
try:
await self.client.command(create_table_sql)
logger.info("待处理公司表 pending_company 创建成功")
except Exception as e:
logger.error(f"创建待处理公司表失败: {e}")
raise
async def initialize_channel_migration(self) -> None:
    """Add the ``channel`` column to every table in ``_CHANNEL_MIGRATION_TABLES``.

    One ``ALTER TABLE ... ADD COLUMN IF NOT EXISTS`` statement is issued per
    table through ``self.client.command``. A failure on one table is logged
    as a warning and does not stop the remaining tables; nothing is re-raised.
    """
    for table in self._CHANNEL_MIGRATION_TABLES:
        # Build the statement up front so the try body only covers the
        # database call and the success log.
        alter_sql = (
            f"ALTER TABLE job_data.{table} "
            "ADD COLUMN IF NOT EXISTS channel String DEFAULT 'mini'"
        )
        try:
            await self.client.command(alter_sql)
            logger.info(f"{table} channel 列迁移完成")
        except Exception as e:
            # Best-effort migration: report and move on to the next table.
            logger.warning(f"{table} channel 列迁移跳过: {e}")
async def create_job_analytics_view(self):
"""创建统一的招聘数据分析视图"""
create_view_sql = """
CREATE VIEW IF NOT EXISTS job_data.job_analytics AS
SELECT
'boss' as source,
job_id,
JSONExtractString(json_data, 'jobName') as position_name,
JSONExtractString(json_data, 'brandName') as company_name,
JSONExtractString(json_data, 'salaryDesc') as salary_text,
0.0 as salary_min,
0.0 as salary_max,
JSONExtractString(json_data, 'cityName') as city,
JSONExtractString(json_data, 'experienceName') as experience_required,
JSONExtractString(json_data, 'degreeName') as education,
created_at
FROM job_data.boss_job
UNION ALL
SELECT
'qcwy' as source,
job_id,
JSONExtractString(json_data, 'jobName') as position_name,
JSONExtractString(json_data, 'companyName') as company_name,
JSONExtractString(json_data, 'provideSalaryString') as salary_text,
0.0, 0.0,
JSONExtractString(json_data, 'workCity') as city,
JSONExtractString(json_data, 'workYear') as experience_required,
JSONExtractString(json_data, 'degree') as education,
created_at
FROM job_data.qcwy_job
UNION ALL
SELECT
'zhilian' as source,
number as job_id,
JSONExtractString(json_data, 'jobName') as position_name,
JSONExtractString(json_data, 'companyName') as company_name,
JSONExtractString(json_data, 'salary60') as salary_text,
0.0, 0.0,
JSONExtractString(json_data, 'workCity') as city,
JSONExtractString(json_data, 'workingExp') as experience_required,
JSONExtractString(json_data, 'education') as education,
created_at
FROM job_data.zhilian_job
"""
try:
await self.client.command(create_view_sql)
logger.info("招聘数据分析视图 job_analytics 创建成功")
except Exception as e:
logger.error(f"创建招聘数据分析视图失败: {e}")
raise
async def initialize_all_tables(self):
async def initialize_all_tables(self) -> None:
"""初始化所有表"""
logger.info("开始初始化 ClickHouse 数据库表...")
try:
# 创建BOSS招聘JSON表
await self.create_boss_job_json_table()
await self.create_boss_company_json_table()
# 创建前程无忧JSON表
await self.create_qcwy_job_json_table()
await self.create_qcwy_company_json_table()
# 创建智联招聘JSON表
await self.create_zhilian_job_json_table()
await self.create_zhilian_company_json_table()
# 创建6张数据表
for name, ddl in self._TABLE_DDLS.items():
await self._create_table(name, ddl)
# 创建待处理公司表
await self.create_pending_company_table()
await self._create_table("pending_company", self._PENDING_COMPANY_DDL)
# 创建统一分析视图
await self.create_job_analytics_view()
# 对已存在的表添加 channel 列
await self.initialize_channel_migration()
# 创建/重建统一分析视图(含 channel 列)
try:
await self.client.command(self._JOB_ANALYTICS_VIEW)
logger.info("招聘数据分析视图 job_analytics 创建成功")
except Exception as e:
logger.error(f"创建招聘数据分析视图失败: {e}")
raise
logger.info("ClickHouse 数据库表初始化完成")
except Exception as e:

View File

@ -1,6 +1,8 @@
import os
from typing import Optional, Dict, Any
import jwt
from fastapi import Depends, Header, HTTPException, Request
from loguru import logger
from app.core.ctx import CTX_USER_ID
from app.models import Role, User
@ -23,7 +25,7 @@ class AuthControl:
@classmethod
async def is_authed(cls, token: str = Header(..., description="token验证")) -> Optional["User"]:
try:
if token == "dev":
if token == "dev" and os.getenv("APP_ENV", "production") == "development":
user = await User.filter().first()
user_id = user.id
else:
@ -39,7 +41,8 @@ class AuthControl:
except jwt.ExpiredSignatureError:
raise HTTPException(status_code=401, detail="登录已过期")
except Exception as e:
raise HTTPException(status_code=500, detail=f"{repr(e)}")
logger.error(f"Auth error: {repr(e)}")
raise HTTPException(status_code=500, detail="Internal authentication error")
class PermissionControl:

View File

@ -1,4 +1,5 @@
import shutil
from pathlib import Path
from aerich import Command
from fastapi import FastAPI
@ -27,8 +28,9 @@ from app.schemas.menus import MenuType
from app.settings.config import settings
from app.core.clickhouse import clickhouse_manager
from app.core.clickhouse_init import ClickHouseInitializer
from app.services.ingest.remote_push import close_http_client
from .middlewares import BackGroundTaskMiddleware, HttpAuditLogMiddleware
from .middlewares import BackGroundTaskMiddleware
from .ip_tracking import IpTrackingMiddleware
@ -42,15 +44,6 @@ def make_middlewares():
allow_headers=settings.CORS_ALLOW_HEADERS,
),
Middleware(BackGroundTaskMiddleware),
Middleware(
HttpAuditLogMiddleware,
methods=["GET", "POST", "PUT", "DELETE"],
exclude_paths=[
"/api/v1/base/access_token",
"/docs",
"/openapi.json",
],
),
Middleware(IpTrackingMiddleware),
]
return middleware
@ -266,10 +259,15 @@ async def init_apis():
async def init_db():
"""执行数据库迁移(受环境开关与并发保护控制)"""
command = Command(tortoise_config=settings.TORTOISE_ORM)
await command.init_db(safe=True)
await command.init()
migration_dir = Path("migrations") / "models"
if not migration_dir.exists():
await command.init_db(safe=True)
return
try:
await command.migrate()
except FileExistsError as e:
logger.info(f"跳过重复迁移文件生成: {e}")
except AttributeError:
logger.warning("unable to retrieve model history from database, model history will be created from scratch")
shutil.rmtree("migrations")

View File

@ -1,5 +1,4 @@
from datetime import datetime
from typing import Any
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
from starlette.requests import Request

View File

@ -1,11 +1,15 @@
import os
import shutil
import tempfile
import time
import uuid
from contextlib import asynccontextmanager
from pathlib import Path
from loguru import logger
class DistributedLock:
"""分布式锁封装,优先使用 Redis不可用时降级为文件锁"""
"""分布式锁封装,优先使用 Redis不可用时降级为文件锁(带 TTL"""
def __init__(self, name: str, ttl_seconds: int = 600):
self.name = name
@ -13,35 +17,77 @@ class DistributedLock:
self.token = str(uuid.uuid4())
self._use_redis = False
self._redis = None
self._file_path = f".lock_{self.name}"
self._file_path = Path(tempfile.gettempdir()) / f"jobdata_lock_{self.name}"
self._init_redis()
def _init_redis(self) -> None:
try:
import redis # type: ignore
import redis.asyncio as aioredis
from app.settings.config import settings
self._redis = redis.Redis(
host=getattr(settings, "REDIS_HOST", None) or "",
host = getattr(settings, "REDIS_HOST", None) or ""
if not host:
return
self._redis = aioredis.Redis(
host=host,
port=getattr(settings, "REDIS_PORT", 6379),
db=getattr(settings, "REDIS_DB", 0),
password=getattr(settings, "REDIS_PASS", None) or None,
socket_timeout=3,
)
# 尝试 ping
if self._redis.ping():
self._use_redis = True
self._use_redis = True
except Exception:
self._use_redis = False
async def _ping_redis(self) -> bool:
if not self._redis:
return False
try:
return bool(await self._redis.ping())
except Exception:
self._use_redis = False
return False
async def acquire(self) -> bool:
"""获取锁,返回是否成功"""
if self._use_redis and self._redis is not None:
try:
# NX+EX 设置锁,避免竞争
return bool(self._redis.set(f"lock:{self.name}", self.token, nx=True, ex=self.ttl))
if not await self._ping_redis():
return self._try_file_lock()
return bool(await self._redis.set(
f"lock:{self.name}", self.token, nx=True, ex=self.ttl
))
except Exception:
pass
# 文件锁降级(单机安全)
return self._try_file_lock()
def _try_file_lock(self) -> bool:
"""文件锁(带 TTL 过期检查),使用绝对路径"""
lock_dir = self._file_path
lock_meta = lock_dir / "meta"
try:
os.mkdir(self._file_path)
lock_dir.mkdir()
lock_meta.write_text(str(time.time()))
return True
except FileExistsError:
if lock_meta.exists():
try:
created = float(lock_meta.read_text())
if time.time() - created > self.ttl:
logger.warning(
f"Stale file lock detected for '{self.name}', "
f"age={time.time() - created:.0f}s > ttl={self.ttl}s. Cleaning up."
)
shutil.rmtree(lock_dir, ignore_errors=True)
try:
lock_dir.mkdir()
lock_meta.write_text(str(time.time()))
return True
except Exception:
return False
except (ValueError, OSError):
pass
return False
except Exception:
return False
@ -49,15 +95,14 @@ class DistributedLock:
"""释放锁"""
if self._use_redis and self._redis is not None:
try:
# 简单释放;生产建议使用 Lua 脚本确保原子性
key = f"lock:{self.name}"
val = self._redis.get(key)
val = await self._redis.get(key)
if val and val.decode() == self.token:
self._redis.delete(key)
await self._redis.delete(key)
except Exception:
pass
try:
os.rmdir(self._file_path)
shutil.rmtree(self._file_path, ignore_errors=True)
except Exception:
pass
@ -72,4 +117,4 @@ class DistributedLock:
yield False
finally:
if acquired:
await self.release()
await self.release()

View File

@ -8,6 +8,7 @@ from pathlib import Path
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
from tortoise.exceptions import OperationalError
from app.core.clickhouse import clickhouse_manager
from app.core.locks import DistributedLock
@ -50,9 +51,6 @@ async def stats_job():
("boss", "job", "boss_job"),
("qcwy", "job", "qcwy_job"),
("zhilian", "job", "zhilian_job"),
("boss", "company", "boss_company"),
("qcwy", "company", "qcwy_company"),
("zhilian", "company", "zhilian_company"),
]
results: list[dict] = []
for source, data_type, table in tables:
@ -170,32 +168,46 @@ async def ecs_full_pipeline_job():
await _record_task_run(task_id, task_name, "fail", started_at, error=str(e))
async def _get_active_proxy() -> "str | None":
    """Return the URL of an active proxy configuration, or ``None`` if there is none.

    Active ``ProxyConfig`` rows are ordered by ``platform`` ascending and the
    first row is used, so a row whose platform is ``'all'`` sorts ahead of
    the ``boss`` / ``qcwy`` / ``zhilian`` platform names.

    Returns:
        The selected row's ``proxy_url``, or ``None`` when no active row exists.
    """
    from app.models.cleaning import ProxyConfig

    proxy_obj = await ProxyConfig.filter(is_active=True).order_by("platform").first()
    if not proxy_obj:
        return None

    # Proxy URLs may embed credentials (scheme://user:pass@host:port). Log only
    # what follows the userinfo part so secrets never reach the log file.
    endpoint = (proxy_obj.proxy_url or "").rsplit("@", 1)[-1]
    logger.info(f"company_cleaning_job using proxy: {proxy_obj.name} ({endpoint[:30]}...)")
    return proxy_obj.proxy_url
async def company_cleaning_job():
"""每5分钟执行自动清洗待处理公司数据"""
from app.services.company_cleaner import company_cleaner
task_id = str(uuid.uuid4())
started_at = datetime.now()
task_name = "company_cleaning_job"
# Use a shorter lock TTL since it runs frequently
lock = DistributedLock(name=task_name, ttl_seconds=300)
async with lock.context() as acquired:
if not acquired:
logger.info("company_cleaning_job skipped: lock not acquired")
return
try:
logger.info("Running automated company cleaning job...")
# 1. Collect new data (with 7-day rule)
# 减少数量确保在5分钟内完成
await company_cleaner.collect_pending_companies(limit=50)
# 2. Process pending data with small delay to be polite
# 2. 从数据库读取代理配置
proxy = await _get_active_proxy()
# 3. Process pending data with small delay to be polite
# 减少数量确保在5分钟内完成30个公司每个约3-5秒加上延迟总计约2-3分钟
# 这样留出时间给收集任务和其他操作
await company_cleaner.process_pending_companies(limit=30, max_delay_seconds=1)
await company_cleaner.process_pending_companies(limit=30, max_delay_seconds=1, proxy=proxy)
duration = (datetime.now() - started_at).total_seconds()
logger.info(f"company_cleaning_job completed in {duration:.2f} seconds")
await _record_task_run(task_id, task_name, "success", started_at)
@ -207,12 +219,12 @@ async def company_cleaning_job():
async def daily_cleanup_job():
"""每天 00:05 执行:清理已完成的任务记录"""
from app.services.company_cleaner import company_cleaner
task_id = str(uuid.uuid4())
started_at = datetime.now()
task_name = "daily_cleanup_job"
lock = DistributedLock(name=task_name, ttl_seconds=3600)
async with lock.context() as acquired:
if not acquired:
return
@ -226,6 +238,34 @@ async def daily_cleanup_job():
await _record_task_run(task_id, task_name, "fail", started_at, error=str(e))
async def stale_crawl_cleanup_job():
    """Runs every 10 minutes: demote keywords stuck in ``crawling`` to ``partial``.

    A keyword counts as stale when its ``crawl_status`` is ``"crawling"`` and
    ``crawl_started_at`` is more than 30 minutes in the past. The job runs
    under a ``DistributedLock`` and returns silently if the lock is not acquired.

    An ``OperationalError`` whose text mentions the crawl columns is logged as
    a warning and swallowed; any other ``OperationalError`` is re-raised.
    """
    from app.models.keyword import BossKeyword, QcwyKeyword, ZhilianKeyword

    task_name = "stale_crawl_cleanup"
    async with DistributedLock(name=task_name, ttl_seconds=300).context() as acquired:
        if not acquired:
            return

        threshold = datetime.now() - timedelta(minutes=30)
        try:
            for model in (BossKeyword, QcwyKeyword, ZhilianKeyword):
                stale_rows = model.filter(
                    crawl_status="crawling",
                    crawl_started_at__lt=threshold,
                )
                count = await stale_rows.update(crawl_status="partial")
                if count:
                    logger.info(f"{model.__name__}: {count} stale crawl tasks marked as partial")
        except OperationalError as e:
            error_text = str(e)
            columns_missing = "crawl_status" in error_text or "crawl_started_at" in error_text
            if not columns_missing:
                raise
            # The error text names a crawl column, so skip this run.
            logger.warning(f"stale_crawl_cleanup skipped due to missing keyword crawl columns: {error_text}")
async def _post_with_retry(body: str):
"""带失败重试的统计结果上报"""
import httpx
@ -270,7 +310,7 @@ def _build_email_html(subject: str, payload: dict) -> str:
return f"<html><head><meta charset='utf-8'><style>{style}</style></head><body>{html_head}{table}</body></html>"
if "anomalies" in payload:
rows = "".join(
f"<tr><td>{a.get('source')}</td><td>{a.get('ip')}</td><td>{a.get('date')}</td></tr>" for a in payload.get("anomalies", [])
f"<tr><td>{a.get('source')}</td><td>{a.get('ip')}</td><td>{a.get('last_report_at', 'N/A')}</td></tr>" for a in payload.get("anomalies", [])
)
table = f"<table><thead><tr><th>来源</th><th>IP</th><th>日期</th></tr></thead><tbody>{rows}</tbody></table>"
return f"<html><head><meta charset='utf-8'><style>{style}</style></head><body>{html_head}{table}</body></html>"
@ -326,8 +366,8 @@ def start_scheduler():
)
# 每6小时触发
scheduler.add_job(stats_job, CronTrigger(second=0, minute=0, hour="*/6"), id="stats_job", replace_existing=True)
# 每6小时触发执行 ECS 全流程
scheduler.add_job(ecs_full_pipeline_job, CronTrigger(second=0, minute=0, hour="*/6"), id="ecs_full_pipeline", replace_existing=True)
# 每6小时触发执行 ECS 全流程偏移30分钟避免与 stats_job 同时执行)
scheduler.add_job(ecs_full_pipeline_job, CronTrigger(second=0, minute=30, hour="*/6"), id="ecs_full_pipeline", replace_existing=True)
# 每10分钟触发告警
scheduler.add_job(ip_alert_job, CronTrigger(second=0, minute="*/10"), id="ip_alert_job", replace_existing=True)
# 每5分钟执行自动清洗
@ -341,6 +381,8 @@ def start_scheduler():
)
# 每天 00:05 执行:清理历史记录
scheduler.add_job(daily_cleanup_job, CronTrigger(second=0, minute=5, hour=0), id="daily_cleanup_job", replace_existing=True)
# 每10分钟执行检测僵死爬取任务并降级为partial
scheduler.add_job(stale_crawl_cleanup_job, CronTrigger(second=0, minute="*/10"), id="stale_crawl_cleanup", replace_existing=True)
scheduler.start()

View File

@ -1,5 +1,6 @@
# 新增model需要在这里导入
from .admin import *
from .company import *
from .metrics import *
from .keyword import *
from .cleaning import *

58
app/models/company.py Normal file
View File

@ -0,0 +1,58 @@
from tortoise import fields
from .base import BaseModel, TimestampMixin
class BaseCompanyModel(BaseModel, TimestampMixin):
    """Abstract Tortoise ORM base holding the columns shared by all company tables.

    Concrete per-platform subclasses only bind a table name. ``Meta.abstract``
    is set, so this class is not meant to map to a table of its own.
    """

    # Company identifier as issued by the source site; unique within each table.
    # NOTE(review): unique=True usually creates an index already, so index=True
    # may be redundant — confirm against the generated schema.
    source_company_id = fields.CharField(max_length=128, unique=True, index=True, description="来源站点公司ID")
    company_name = fields.CharField(max_length=255, index=True, description="公司名称")

    # Optional descriptive attributes; all nullable.
    company_type = fields.CharField(max_length=100, null=True, description="公司性质")
    industry = fields.CharField(max_length=255, null=True, description="行业")
    company_size = fields.CharField(max_length=100, null=True, description="公司规模")
    financing_stage = fields.CharField(max_length=100, null=True, description="融资阶段")
    city = fields.CharField(max_length=100, null=True, description="城市")
    address = fields.TextField(null=True, description="地址")
    website = fields.CharField(max_length=500, null=True, description="官网")
    logo_url = fields.CharField(max_length=1000, null=True, description="Logo地址")
    description = fields.TextField(null=True, description="公司简介")

    # Raw company payload (required, no default).
    raw_json = fields.JSONField(description="原始公司JSON")

    # Crawl timestamps; both required and indexed. They have no auto_now /
    # auto_now_add, so callers must supply values.
    first_crawled_at = fields.DatetimeField(index=True, description="首次抓取时间")
    last_crawled_at = fields.DatetimeField(index=True, description="最后抓取时间")

    class Meta:
        abstract = True
class BossCompany(BaseCompanyModel):
    """Company record stored in the ``boss_company`` table."""

    class Meta:
        table = "boss_company"
class QcwyCompany(BaseCompanyModel):
    """Company record stored in the ``qcwy_company`` table."""

    class Meta:
        table = "qcwy_company"
class ZhilianCompany(BaseCompanyModel):
    """Company record stored in the ``zhilian_company`` table."""

    class Meta:
        table = "zhilian_company"
class CompanyCleaningQueue(BaseModel, TimestampMixin):
    """Queue row tracking the cleaning of one company from one source platform.

    Rows are unique per ``(source, company_id)`` and stored in the
    ``company_cleaning_queue`` table.
    """

    # Identity of the queued company.
    source = fields.CharField(max_length=20, index=True, description="来源平台")
    company_id = fields.CharField(max_length=128, index=True, description="来源站点公司ID")
    company_name = fields.CharField(max_length=255, null=True, description="公司名称")

    # Processing state; new rows start as "pending".
    status = fields.CharField(max_length=20, default="pending", index=True, description="状态")
    error_msg = fields.TextField(null=True, description="错误信息")
    retry_count = fields.IntField(default=0, description="重试次数")
    started_at = fields.DatetimeField(null=True, description="开始处理时间")
    finished_at = fields.DatetimeField(null=True, description="处理结束时间")

    # Counters and error text for the company's job sync; counters default to 0.
    jobs_fetched = fields.IntField(default=0, description="抓取到的职位数")
    jobs_stored = fields.IntField(default=0, description="写入ClickHouse成功数")
    jobs_duplicate = fields.IntField(default=0, description="职位重复数")
    jobs_failed = fields.IntField(default=0, description="职位写入失败数")
    jobs_error_msg = fields.TextField(null=True, description="职位同步错误信息")

    class Meta:
        table = "company_cleaning_queue"
        # One queue entry per company per source platform.
        unique_together = (("source", "company_id"),)

View File

@ -8,6 +8,17 @@ class BaseKeyword(Model):
job = fields.CharField(max_length=128)
last_requested_date = fields.DateField(null=True)
last_requested_at = fields.DatetimeField(null=True)
# 爬取状态管理
crawl_status = fields.CharField(max_length=16, default="idle")
last_completed_page = fields.IntField(default=0)
total_pages = fields.IntField(default=0)
jobs_found = fields.IntField(default=0)
crawl_started_at = fields.DatetimeField(null=True)
crawler_id = fields.CharField(max_length=64, default="")
error_message = fields.TextField(default="")
retry_count = fields.IntField(default=0)
created_at = fields.DatetimeField(auto_now_add=True)
updated_at = fields.DatetimeField(auto_now=True)

View File

@ -1,5 +1,3 @@
import math
from collections.abc import Generator
from datetime import datetime
from typing import Optional, Dict, Any, List
from clickhouse_connect.driver import AsyncClient

View File

@ -1,5 +1,5 @@
from datetime import datetime
from typing import Optional, Dict, Any, List
from typing import Optional, Dict, List
from pydantic import BaseModel, Field
@ -12,6 +12,7 @@ class AnalyticsQueryParams(BaseModel):
position_name: Optional[str] = Field(None, description="职位名称筛选")
industry: Optional[str] = Field(None, description="行业筛选")
experience_required: Optional[str] = Field(None, description="经验要求筛选")
channel: Optional[str] = Field(None, description="渠道筛选 (mini/web/app)")
limit: int = Field(10, ge=1, le=100, description="返回结果数量限制")

View File

@ -1,5 +1,5 @@
from datetime import date, datetime
from typing import Optional
from typing import Literal, Optional
from pydantic import BaseModel, Field
@ -22,8 +22,28 @@ class KeywordOut(KeywordBase):
id: int
last_requested_date: Optional[date] = None
last_requested_at: Optional[datetime] = None
crawl_status: str = "idle"
last_completed_page: int = 0
total_pages: int = 0
jobs_found: int = 0
retry_count: int = 0
created_at: datetime
updated_at: datetime
class Config:
from_attributes = True
class PageProgressRequest(BaseModel):
    """Request body reporting crawl progress for one keyword page."""

    # Platform identifier; must be exactly boss, qcwy or zhilian.
    source: str = Field(..., pattern="^(boss|qcwy|zhilian)$")
    keyword_id: int
    # Page number being reported; must be >= 1.
    page: int = Field(..., ge=1)
    # Optional non-negative counters; both default to 0.
    total_pages: int = Field(0, ge=0)
    jobs_found: int = Field(0, ge=0)
class CrawlCompleteRequest(BaseModel):
    """Request body reporting the final outcome of a keyword crawl."""

    # Platform identifier; must be exactly boss, qcwy or zhilian.
    source: str = Field(..., pattern="^(boss|qcwy|zhilian)$")
    keyword_id: int
    # Final outcome; only these two values are accepted.
    status: Literal["completed", "failed"]
    # Optional error text; defaults to the empty string.
    error_message: str = ""

View File

@ -1,6 +1,6 @@
from pydantic import BaseModel, Field
from datetime import datetime
from typing import Optional, List, Dict, Any
from typing import Optional, List
class BossTokenCreate(BaseModel):

View File

@ -1,32 +1,44 @@
import asyncio
import csv
import io
import re
import time
from typing import List, Dict, Any, Union, Optional
from fastapi import UploadFile
from loguru import logger
from app.services.crawler.boss import BossService
from app.services.crawler.qcwy import QcwyService
from app.services.crawler.zhilian import ZhilianService
from app.services.job import DataRouterService, DataType, PlatformType
from app.services.company_jobs_sync import CompanyJobsSyncService
from app.services.company_storage import company_storage
from app.services.ingest import IngestService
from app.core.clickhouse import clickhouse_manager
from app.models.token import BossToken
from jobs_spider.qcwy.search_company_jobs import _extract_items as qcwy_extract_items
class CleaningService:
_TOKEN_REFRESH_INTERVAL = 3600 # 1小时刷新一次
def __init__(self):
self.boss_service = BossService()
self.qcwy_service = QcwyService()
self.zhilian_service = ZhilianService()
self.data_router = None
self.company_jobs_sync = CompanyJobsSyncService()
self.data_router: Optional[IngestService] = None
self._boss_token_loaded = False
self._token_loaded_at: float = 0
def _apply_proxy(self, proxy: Optional[str]) -> None:
self.boss_service.set_proxy(proxy)
self.qcwy_service.set_proxy(proxy)
self.zhilian_service.set_proxy(proxy)
self.company_jobs_sync.set_proxy(proxy)
async def _ensure_boss_token_loaded(self) -> None:
if self._boss_token_loaded and self.boss_service.login_data.get("mpt"):
now = time.time()
if (self._boss_token_loaded
and self.boss_service.login_data.get("mpt")
and now - self._token_loaded_at < self._TOKEN_REFRESH_INTERVAL):
return
token_obj = await BossToken.filter(is_active=True).order_by("-updated_at").first()
if not token_obj:
@ -34,21 +46,21 @@ class CleaningService:
return
self.boss_service.set_login_data(token_obj.mpt or "", "")
self._boss_token_loaded = True
self._token_loaded_at = now
async def get_data_router(self) -> DataRouterService:
async def get_data_router(self) -> IngestService:
if not self.data_router:
client = await clickhouse_manager.get_client()
self.data_router = DataRouterService(client)
self.data_router = IngestService(client)
return self.data_router
async def parse_file(self, file: UploadFile) -> List[str]:
content = await file.read()
filename = file.filename
targets = []
if filename.endswith('.csv'):
text = content.decode('utf-8')
# Handle BOM
if text.startswith('\uFEFF'):
text = text[1:]
reader = csv.reader(io.StringIO(text))
@ -58,9 +70,22 @@ class CleaningService:
else:
text = content.decode('utf-8')
targets = [line.strip() for line in text.splitlines() if line.strip()]
return [t for t in targets if t]
async def _store_company_record(
    self,
    source: str,
    data: Dict[str, Any],
    company_id: str,
) -> Dict[str, Any]:
    """Upsert one company through ``company_storage`` and annotate the result.

    Args:
        source: Platform identifier passed through to the storage layer.
        data: Company payload to store; also echoed back under ``original_data``.
        company_id: Source-site company ID, passed as the ``company_id`` keyword.

    Returns:
        The dict returned by ``company_storage.upsert_company`` with
        ``duplicate`` and ``remote_sent`` forced to ``False``, a fixed
        ``message``, and ``original_data`` set to ``data``.
    """
    stored = await company_storage.upsert_company(source, data, company_id=company_id)
    # Add the status keys in one step.
    stored.update(
        {
            "duplicate": False,
            "remote_sent": False,
            "message": "公司数据已写入MySQL",
            "original_data": data,
        }
    )
    return stored
async def process_single_item(self, target: str, clean_type: str = "auto", platform: str = "auto", proxy: Optional[str] = None) -> Dict[str, Any]:
try:
await self._ensure_boss_token_loaded()
@ -90,7 +115,7 @@ class CleaningService:
result = await self.clean_qcwy_company_jobs(target)
elif platform == "zhilian":
result = await self.clean_zhilian_company_jobs(target)
if not result:
return {
"success": False,
@ -99,9 +124,7 @@ class CleaningService:
"storage_status": "failed",
"remote_sent": False
}
# Normalize result if it's just a dict (from store_data)
# If it's a boolean (from some legacy paths), wrap it
if isinstance(result, bool):
return {
"success": result,
@ -110,15 +133,15 @@ class CleaningService:
"storage_status": "unknown",
"remote_sent": False
}
# If it's the dict returned by DataRouterService.store_data
return {
"success": result.get("success", False),
"target": target,
"error": result.get("message") if not result.get("success") else None,
"storage_status": "duplicate" if result.get("duplicate") else "saved",
"remote_sent": result.get("remote_sent", False),
"data_summary": result.get("data_summary"), # Optional: summary of data
"data_summary": result.get("data_summary"),
"jobs_summary": result.get("jobs_summary"),
"original_data": result.get("original_data")
}
@ -145,8 +168,7 @@ class CleaningService:
router = await self.get_data_router()
data = None
result = None
# Try to extract ID if target looks like a URL
if platform == "boss":
match = re.search(r'job_detail/([^.]+)\.html', target)
if match:
@ -159,55 +181,50 @@ class CleaningService:
match = re.search(r'jobs\.zhaopin\.com/(\w+)\.htm', target)
if match:
target = match.group(1)
if platform == "boss":
data = self.boss_service.get_job_detail_by_id(target)
data = await asyncio.to_thread(self.boss_service.get_job_detail_by_id, target)
if data:
result = await router.store_data(data, DataType.JOB, PlatformType.BOSS)
result = await router.store_single("boss", "mini", "job", data)
elif platform == "qcwy":
data = self.qcwy_service.get_job_detail(target)
data = await asyncio.to_thread(self.qcwy_service.get_job_detail, target)
if data:
result = await router.store_data(data, DataType.JOB, PlatformType.QCWY)
result = await router.store_single("qcwy", "mini", "job", data)
elif platform == "zhilian":
data = self.zhilian_service.get_job_detail(target)
data = await asyncio.to_thread(self.zhilian_service.get_job_detail, target)
if data:
result = await router.store_data(data, DataType.JOB, PlatformType.ZHILIAN)
result = await router.store_single("zhilian", "mini", "job", data)
if result and isinstance(result, dict) and data:
result['original_data'] = data
return result
return False
async def clean_by_company_name(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
router = await self.get_data_router()
if platform == "boss":
res = self.boss_service.search_jobs(target)
res = await asyncio.to_thread(self.boss_service.search_jobs, target)
if res and res.get('zpData') and res['zpData'].get('list'):
# For company name search, we might get multiple jobs.
# Currently we just return the result of the LAST one for simplicity in status reporting,
# or we should change logic to handle list.
# For now, let's just process them and return the last result as indicative.
last_result = None
for job in res['zpData']['list']:
last_result = await router.store_data(job, DataType.JOB, PlatformType.BOSS)
last_result = await router.store_single("boss", "mini", "job", job)
if last_result and isinstance(last_result, dict):
# For search results, we store the full search response as original data
last_result['original_data'] = res
return last_result if last_result else False
elif platform == "qcwy":
res = self.qcwy_service.search_jobs(target)
res = await asyncio.to_thread(self.qcwy_service.search_jobs, target)
if res:
last_result = None
for job in res:
last_result = await router.store_data(job, DataType.JOB, PlatformType.QCWY)
last_result = await router.store_single("qcwy", "mini", "job", job)
if last_result and isinstance(last_result, dict):
last_result['original_data'] = res
return last_result if last_result else False
elif platform == "zhilian":
res = self.zhilian_service.search_company_jobs_by_name(target)
res = await asyncio.to_thread(self.zhilian_service.search_company_jobs_by_name, target)
if res and isinstance(res, dict):
data = res.get("data") or {}
items = data.get("list") or []
@ -215,148 +232,95 @@ class CleaningService:
items = []
last_result = None
for job in items:
last_result = await router.store_data(job, DataType.JOB, PlatformType.ZHILIAN)
last_result = await router.store_single("zhilian", "mini", "job", job)
if last_result and isinstance(last_result, dict):
last_result["original_data"] = res
return last_result if last_result else False
return False
async def clean_by_company_id(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
router = await self.get_data_router()
data = None
result = None
if platform == "boss":
data = self.boss_service.get_company_detail_by_id(target)
data = await asyncio.to_thread(self.boss_service.get_company_detail_by_id, target)
if data:
result = await router.store_data(data, DataType.COMPANY, PlatformType.BOSS)
result = await self._store_company_record("boss", data, target)
result["jobs_summary"] = await self.company_jobs_sync.sync_company_jobs("boss", target)
elif platform == "qcwy":
company_id = target
match = re.match(r"^co(\d+)$", company_id)
if match:
company_id = match.group(1)
data = self.qcwy_service.get_company_info(company_id)
data = await asyncio.to_thread(self.qcwy_service.get_company_info, company_id)
if data:
result = await router.store_data(data, DataType.COMPANY, PlatformType.QCWY)
result = await self._store_company_record("qcwy", data, company_id)
result["jobs_summary"] = await self.company_jobs_sync.sync_company_jobs("qcwy", company_id)
elif platform == "zhilian":
data = self.zhilian_service.get_company_detail(target)
data = await asyncio.to_thread(self.zhilian_service.get_company_detail, target)
if data:
result = await router.store_data(data, DataType.COMPANY, PlatformType.ZHILIAN)
result = await self._store_company_record("zhilian", data, target)
result["jobs_summary"] = await self.company_jobs_sync.sync_company_jobs("zhilian", target)
if result and isinstance(result, dict) and data:
result['original_data'] = data
return result
return False
async def clean_boss_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
router = await self.get_data_router()
company_id = target
match = re.search(r'gongsi/([^.]+)\.html', target)
if match:
company_id = match.group(1)
data = self.boss_service.get_company_jobs_by_id(company_id)
if not data:
return False
jobs = []
zp_data = data.get("zpData") if isinstance(data, dict) else None
if isinstance(zp_data, dict):
if isinstance(zp_data.get("jobList"), list):
jobs = zp_data.get("jobList") or []
elif isinstance(zp_data.get("list"), list):
jobs = zp_data.get("list") or []
if not jobs:
return False
last_result: Optional[Dict[str, Any]] = None
for job in jobs:
last_result = await router.store_data(job, DataType.JOB, PlatformType.BOSS)
if last_result and isinstance(last_result, dict):
last_result["original_data"] = data
return last_result
return False
result = await self.company_jobs_sync.sync_company_jobs("boss", company_id)
return result if result.get("jobs_fetched", 0) > 0 else False
async def clean_qcwy_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
router = await self.get_data_router()
company_id = target
match = re.match(r'^co(\d+)$', company_id)
if match:
company_id = match.group(1)
data = self.qcwy_service.get_company_jobs_by_id(company_id)
if not data:
return False
jobs_list = qcwy_extract_items(data)
jobs: List[Dict[str, Any]] = jobs_list if isinstance(jobs_list, list) else []
if not jobs:
return False
last_result: Optional[Dict[str, Any]] = None
for job in jobs:
last_result = await router.store_data(job, DataType.JOB, PlatformType.QCWY)
if last_result and isinstance(last_result, dict):
last_result["original_data"] = data
return last_result
return False
result = await self.company_jobs_sync.sync_company_jobs("qcwy", company_id)
return result if result.get("jobs_fetched", 0) > 0 else False
async def clean_zhilian_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
router = await self.get_data_router()
company_id = target
data = self.zhilian_service.get_company_jobs_by_id(company_id)
if not data or not isinstance(data, dict):
return False
data_field = data.get("data") or {}
jobs = data_field.get("list") or []
if not isinstance(jobs, list) or not jobs:
return False
last_result: Optional[Dict[str, Any]] = None
for job in jobs:
last_result = await router.store_data(job, DataType.JOB, PlatformType.ZHILIAN)
if last_result and isinstance(last_result, dict):
last_result["original_data"] = data
return last_result
return False
result = await self.company_jobs_sync.sync_company_jobs("zhilian", company_id)
return result if result.get("jobs_fetched", 0) > 0 else False
async def _process_boss_url(self, url: str) -> Union[bool, Dict[str, Any]]:
job_match = re.search(r'job_detail/([^.]+)\.html', url)
if job_match:
return await self.clean_by_job_id(job_match.group(1), "boss")
company_match = re.search(r'gongsi/([^.]+)\.html', url)
if company_match:
return await self.clean_by_company_id(company_match.group(1), "boss")
# Fallback: assume it's a job ID
return await self.clean_by_job_id(url, "boss")
async def _process_qcwy_url(self, url: str) -> Union[bool, Dict[str, Any]]:
job_match = re.search(r'/(\d+)\.html', url)
if job_match:
return await self.clean_by_job_id(job_match.group(1), "qcwy")
# Fallback: assume it's a job ID
company_match = re.search(r'co(\d+)', url, re.IGNORECASE)
if company_match:
return await self.clean_by_company_id(company_match.group(1), "qcwy")
return await self.clean_by_job_id(url, "qcwy")
async def _process_zhilian_url(self, url: str) -> Union[bool, Dict[str, Any]]:
job_match = re.search(r'jobs\.zhaopin\.com/(\w+)\.htm', url)
if job_match:
return await self.clean_by_job_id(job_match.group(1), "zhilian")
# Fallback: assume it's a job ID
company_match = re.search(r'/company/([A-Za-z0-9]+)', url)
if company_match:
return await self.clean_by_company_id(company_match.group(1), "zhilian")
return await self.clean_by_job_id(url, "zhilian")
async def _process_search_company(self, name: str) -> Union[bool, Dict[str, Any]]:

View File

@ -1,32 +1,69 @@
import asyncio
import json
import random
from datetime import datetime
from typing import Any, Dict, List, Optional
import time
from typing import Any, Optional
from loguru import logger
from app.core.clickhouse import clickhouse_manager
from app.models.company import CompanyCleaningQueue
from app.models.token import BossToken
from app.services.company_jobs_sync import CompanyJobsSyncService
from app.services.company_storage import company_storage, normalize_company_id
from app.services.crawler.boss import BossService
from app.services.crawler.qcwy import QcwyService
from app.services.crawler.zhilian import ZhilianService
# Per-platform settings for discovering company candidates in the ClickHouse
# job tables. Dict order is the order in which sources are collected.
#   job_table         - table name under the job_data database to scan.
#   company_id_expr   - SQL expression yielding the company ID from json_data.
#   company_name_expr - SQL expression yielding the company name from json_data.
#   days_back         - initial created_at look-back window, in days.
#   max_query_limit   - optional hard cap on the candidate query's LIMIT
#                       (None means no cap beyond the caller's limit * 5).
# The table name and expressions are interpolated into SQL verbatim, so they
# must remain trusted constants.
SOURCE_CONFIGS: dict[str, dict[str, Any]] = {
    "zhilian": {
        "job_table": "zhilian_job",
        "company_id_expr": "JSONExtractString(json_data, 'companyNumber')",
        "company_name_expr": "JSONExtractString(json_data, 'companyName')",
        "days_back": 30,
        "max_query_limit": None,
    },
    "qcwy": {
        "job_table": "qcwy_job",
        "company_id_expr": "JSONExtractString(json_data, 'coId')",
        "company_name_expr": "JSONExtractString(json_data, 'companyName')",
        "days_back": 30,
        "max_query_limit": 5000,
    },
    "boss": {
        "job_table": "boss_job",
        # Boss keeps company fields nested under brandComInfoVO.
        "company_id_expr": "JSONExtractString(json_data, 'brandComInfoVO', 'encryptBrandId')",
        "company_name_expr": "JSONExtractString(json_data, 'brandComInfoVO', 'brandName')",
        "days_back": 30,
        "max_query_limit": None,
    },
}
class CompanyCleaner:
_TOKEN_REFRESH_INTERVAL = 3600
def __init__(self) -> None:
    """Instantiate the platform crawler services and reset Boss-token state."""
    # One crawler client per supported platform.
    self.boss_service = BossService()
    self.qcwy_service = QcwyService()
    self.zhilian_service = ZhilianService()
    # Helper whose sync_company_jobs() is used to sync a company's job postings.
    self.company_jobs_sync = CompanyJobsSyncService()
    # Token bookkeeping read by _ensure_boss_token_loaded: whether a token has
    # been loaded, and the time.time() stamp of that load (0 means never).
    self._boss_token_loaded = False
    self._token_loaded_at: float = 0
def _apply_proxy(self, proxy: Optional[str]) -> None:
self.boss_service.set_proxy(proxy)
self.qcwy_service.set_proxy(proxy)
self.zhilian_service.set_proxy(proxy)
self.company_jobs_sync.set_proxy(proxy)
async def _ensure_boss_token_loaded(self) -> None:
if self._boss_token_loaded and self.boss_service.login_data.get("mpt"):
now = time.time()
if (
self._boss_token_loaded
and self.boss_service.login_data.get("mpt")
and now - self._token_loaded_at < self._TOKEN_REFRESH_INTERVAL
):
return
token_obj = await BossToken.filter(is_active=True).order_by("-updated_at").first()
if not token_obj:
@ -34,420 +71,138 @@ class CompanyCleaner:
return
self.boss_service.set_login_data(token_obj.mpt or "", "")
self._boss_token_loaded = True
self._token_loaded_at = now
async def collect_pending_companies(self, limit: int = 1000, source: Optional[str] = None):
async def collect_pending_companies(self, limit: int = 1000, source: Optional[str] = None) -> dict[str, Any]:
client = await clickhouse_manager.get_client()
logger.info(f"Starting to collect pending companies (limit={limit}, source={source or 'all'})...")
if source is None or source == "zhilian":
await self._collect_zhilian(client, limit)
if source is None or source == "qcwy":
await self._collect_qcwy(client, limit)
if source is None or source == "boss":
await self._collect_boss(client, limit)
summary: dict[str, Any] = {
"total_created": 0,
"sources": {},
}
sources = [s for s in SOURCE_CONFIGS if source is None or source == s]
# 并行采集各平台
tasks = [self._collect_source(client, s, limit) for s in sources]
results = await asyncio.gather(*tasks, return_exceptions=True)
for s, result in zip(sources, results):
if isinstance(result, Exception):
logger.error(f"Error collecting {s}: {result}")
summary["sources"][s] = {"source": s, "created_count": 0, "error": str(result)}
else:
summary["sources"][s] = result
summary["total_created"] += result["created_count"]
logger.info("Finished collecting pending companies.")
return summary
async def _collect_zhilian(self, client, limit: int):
logger.info("Collecting Zhilian companies...")
# 优化先获取已存在的公司ID避免在子查询中读取json_data
# 使用PREWHERE提前过滤时间范围减少需要读取的数据量
# 检查90天内已处理的公司避免重复请求
days_back_existing = 90 # 查询最近90天的数据避免重复请求已处理过的公司
existing_companies_query = f"""
SELECT DISTINCT JSONExtractString(json_data, 'companyNumber') as cid
FROM job_data.zhilian_company
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'companyNumber') != ''
LIMIT 50000
"""
# 添加重试机制
existing_result = None
existing_cids = set() # 默认使用空集合
for attempt in range(3):
try:
logger.info(f"Querying existing Zhilian companies (attempt {attempt+1})...")
existing_result = await client.query(existing_companies_query)
existing_cids = {row[0] for row in existing_result.result_rows if row[0]}
break
except Exception as e:
error_str = str(e).lower()
if "memory" in error_str or "memory_limit" in error_str:
if attempt == 0:
days_back_existing = 1
existing_companies_query = f"""
SELECT DISTINCT JSONExtractString(json_data, 'companyNumber') as cid
FROM job_data.zhilian_company
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'companyNumber') != ''
LIMIT 5000
"""
logger.warning(f"Memory error, reducing time range to {days_back_existing} days")
elif attempt == 1:
existing_companies_query = f"""
SELECT DISTINCT JSONExtractString(json_data, 'companyNumber') as cid
FROM job_data.zhilian_company SAMPLE 0.1
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'companyNumber') != ''
LIMIT 2000
"""
logger.warning(f"Memory error persists, using SAMPLE 0.1")
else:
logger.error(f"Failed to query existing companies after {attempt+1} attempts: {e}")
logger.warning("Using empty set for existing_cids, continuing with collection...")
existing_cids = set()
break
else:
logger.error(f"Non-memory error while querying existing companies: {e}")
raise
pending_query = "SELECT DISTINCT company_id FROM job_data.pending_company WHERE source = 'zhilian'"
pending_result = await client.query(pending_query)
pending_cids = {row[0] for row in pending_result.result_rows if row[0]}
# 构建排除列表
exclude_cids = existing_cids | pending_cids
# 优化添加时间范围过滤只查询最近30天的数据减少扫描量
# 使用 PREWHERE 提前过滤时间范围,避免读取大量历史数据的 json_data
# 增加 LIMIT 以便在 Python 中过滤后仍有足够的数据
query = f"""
SELECT DISTINCT
JSONExtractString(json_data, 'companyNumber') as cid,
JSONExtractString(json_data, 'companyName') as cname
FROM job_data.zhilian_job
PREWHERE created_at > now() - INTERVAL 30 DAY
WHERE json_data != ''
AND JSONExtractString(json_data, 'companyNumber') != ''
LIMIT {limit * 2}
"""
logger.info(f"Executing SQL for Zhilian (limit={limit * 2}): {query[:500]}...")
result = await client.query(query)
if not result.result_rows:
return
# 在 Python 中过滤掉已存在的和待处理的
rows: List[Dict[str, Any]] = []
for cid, cname in result.result_rows:
if not cid or cid in exclude_cids:
continue
if len(rows) >= limit:
break
rows.append(
{
"source": "zhilian",
"company_id": cid,
"company_name": cname,
"status": "pending",
"created_at": datetime.now(),
"updated_at": datetime.now(),
}
)
await self._insert_pending(client, rows)
logger.info(f"Added {len(rows)} Zhilian companies to pending.")
async def _collect_source(self, client, source: str, limit: int) -> dict[str, Any]:
config = SOURCE_CONFIGS[source]
async def _collect_qcwy(self, client, limit: int):
logger.info("Collecting QCWY companies...")
# 优化先获取已存在的公司ID避免在子查询中读取json_data
# 使用PREWHERE提前过滤时间范围减少需要读取的数据量
# 检查90天内已处理的公司避免重复请求
days_back_existing = 90 # 查询最近90天的数据避免重复请求已处理过的公司
existing_companies_query = f"""
SELECT DISTINCT JSONExtractString(json_data, 'companyId') as cid
FROM job_data.qcwy_company
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'companyId') != ''
LIMIT 50000
"""
# 添加重试机制
existing_result = None
existing_cids = set() # 默认使用空集合
for attempt in range(3):
try:
logger.info(f"Querying existing QCWY companies (attempt {attempt+1})...")
existing_result = await client.query(existing_companies_query)
# 查询成功,提取结果
existing_cids = {row[0] for row in existing_result.result_rows if row[0]}
break
except Exception as e:
error_str = str(e).lower()
if "memory" in error_str or "memory_limit" in error_str:
if attempt == 0:
# 第一次失败:进一步减少时间范围
days_back_existing = 1
existing_companies_query = f"""
SELECT DISTINCT JSONExtractString(json_data, 'companyId') as cid
FROM job_data.qcwy_company
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'companyId') != ''
LIMIT 5000
"""
logger.warning(f"Memory error, reducing time range to {days_back_existing} days")
elif attempt == 1:
# 第二次失败:使用采样
existing_companies_query = f"""
SELECT DISTINCT JSONExtractString(json_data, 'companyId') as cid
FROM job_data.qcwy_company SAMPLE 0.1
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'companyId') != ''
LIMIT 2000
"""
logger.warning(f"Memory error persists, using SAMPLE 0.1")
else:
# 最后一次尝试也失败,使用空集合继续执行(避免阻塞整个流程)
logger.error(f"Failed to query existing companies after {attempt+1} attempts: {e}")
logger.warning("Using empty set for existing_cids, continuing with collection...")
existing_cids = set()
break
else:
# 其他错误直接抛出
logger.error(f"Non-memory error while querying existing companies: {e}")
raise
pending_query = "SELECT DISTINCT company_id FROM job_data.pending_company WHERE source = 'qcwy'"
pending_result = await client.query(pending_query)
pending_cids = {row[0] for row in pending_result.result_rows if row[0]}
# 构建排除列表
exclude_cids = existing_cids | pending_cids
# 优化策略:
# 1. 减少时间范围从30天减少到7天大幅减少扫描的数据量
# 2. 减少LIMIT从limit*2减少到更小的值减少内存占用
# 3. 使用更严格的PREWHERE条件先过滤时间再过滤空json_data和超大JSON
# 4. 限制JSON大小过滤掉过大的json_data可能包含大量嵌套数据
# 5. 分批查询如果limit较大分批处理每次查询更少的数据
days_back = 7 # 从30天减少到7天减少扫描量
# 注意不使用length(json_data)检查,因为它需要读取整个列来计算长度
query_limit = min(limit * 2, 100) # 限制最大查询数量,避免内存超限
# 分批查询策略如果limit较大分批处理
result = None
for attempt in range(3): # 最多尝试3次
try:
# 根据尝试次数调整参数
if attempt == 1:
# 第一次失败后减少时间范围到3天
days_back = 3
query_limit = min(query_limit, 50)
logger.warning(f"Retry {attempt}: Reducing time range to {days_back} days and limit to {query_limit}")
elif attempt == 2:
# 第二次失败后:使用采样
query = f"""
SELECT DISTINCT
JSONExtractString(json_data, 'coId') as cid,
JSONExtractString(json_data, 'companyName') as cname
FROM job_data.qcwy_job SAMPLE 0.1
PREWHERE created_at > now() - INTERVAL {days_back} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'coId') != ''
LIMIT {query_limit}
"""
logger.warning(f"Retry {attempt}: Using SAMPLE 0.1 to reduce memory usage")
result = await client.query(query)
break
# 正常查询或第一次重试
query = f"""
SELECT DISTINCT
JSONExtractString(json_data, 'coId') as cid,
JSONExtractString(json_data, 'companyName') as cname
FROM job_data.qcwy_job
PREWHERE created_at > now() - INTERVAL {days_back} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'coId') != ''
LIMIT {query_limit}
"""
logger.info(f"Executing SQL for QCWY (limit={query_limit}, days={days_back}, attempt={attempt+1}): {query[:400]}...")
result = await client.query(query)
break
except Exception as e:
error_str = str(e).lower()
# 如果查询失败(可能是内存超限),继续重试
if "memory" in error_str or "memory_limit" in error_str:
if attempt < 2:
logger.warning(f"Memory error on attempt {attempt+1}: {e}")
continue
else:
# 最后一次尝试也失败,抛出异常
logger.error(f"Query failed after {attempt+1} attempts: {e}")
raise
else:
# 其他错误直接抛出
logger.error(f"Query failed with non-memory error: {e}")
raise
if not result or not result.result_rows:
logger.info("No QCWY companies found in query result.")
return
# 在 Python 中过滤掉已存在的和待处理的
rows: List[Dict[str, Any]] = []
for cid, cname in result.result_rows:
if not cid or cid in exclude_cids:
continue
if len(rows) >= limit:
break
rows.append(
{
"source": "qcwy",
"company_id": cid,
"company_name": cname,
"status": "pending",
"created_at": datetime.now(),
"updated_at": datetime.now(),
}
)
if rows:
await self._insert_pending(client, rows)
logger.info(f"Added {len(rows)} QCWY companies to pending.")
else:
logger.info("No new QCWY companies found after filtering.")
# 先从 MySQL 取出该平台所有已入队/已入库的 company_idPython 侧快速排除
all_queued = set(await CompanyCleaningQueue.filter(source=source).values_list("company_id", flat=True))
all_existing = await company_storage.get_all_company_ids(source)
exclude_ids = all_queued | all_existing
logger.info(f"Loaded {len(exclude_ids)} known {source} company IDs for exclusion")
async def _collect_boss(self, client, limit: int):
logger.info("Collecting Boss companies...")
# 优化先获取已存在的公司ID避免在子查询中读取json_data
# 使用PREWHERE提前过滤时间范围减少需要读取的数据量
# 检查90天内已处理的公司避免重复请求
days_back_existing = 90 # 查询最近90天的数据避免重复请求已处理过的公司
existing_companies_query = f"""
SELECT DISTINCT JSONExtractString(json_data, 'brandId') as cid
FROM job_data.boss_company
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'brandId') != ''
LIMIT 50000
"""
# 添加重试机制
existing_result = None
existing_cids = set() # 默认使用空集合
for attempt in range(3):
try:
logger.info(f"Querying existing Boss companies (attempt {attempt+1})...")
existing_result = await client.query(existing_companies_query)
existing_cids = {row[0] for row in existing_result.result_rows if row[0]}
break
except Exception as e:
error_str = str(e).lower()
if "memory" in error_str or "memory_limit" in error_str:
if attempt == 0:
days_back_existing = 1
existing_companies_query = f"""
SELECT DISTINCT JSONExtractString(json_data, 'brandId') as cid
FROM job_data.boss_company
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'brandId') != ''
"""
logger.warning(f"Memory error, reducing time range to {days_back_existing} days")
elif attempt == 1:
existing_companies_query = f"""
SELECT DISTINCT JSONExtractString(json_data, 'brandId') as cid
FROM job_data.boss_company SAMPLE 0.1
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'brandId') != ''
"""
logger.warning(f"Memory error persists, using SAMPLE 0.1")
else:
logger.error(f"Failed to query existing companies after {attempt+1} attempts: {e}")
logger.warning("Using empty set for existing_cids, continuing with collection...")
existing_cids = set()
break
else:
logger.error(f"Non-memory error while querying existing companies: {e}")
raise
pending_query = "SELECT DISTINCT company_id FROM job_data.pending_company WHERE source = 'boss'"
pending_result = await client.query(pending_query)
pending_cids = {row[0] for row in pending_result.result_rows if row[0]}
# 构建排除列表
exclude_cids = existing_cids | pending_cids
# 优化添加时间范围过滤只查询最近30天的数据减少扫描量
# 使用 PREWHERE 提前过滤时间范围,避免读取大量历史数据的 json_data
# 增加 LIMIT 以便在 Python 中过滤后仍有足够的数据
query = f"""
SELECT DISTINCT
JSONExtractString(json_data, 'brandId') as cid,
JSONExtractString(json_data, 'brandName') as cname
FROM job_data.boss_job
PREWHERE created_at > now() - INTERVAL 30 DAY
WHERE json_data != ''
AND JSONExtractString(json_data, 'brandId') != ''
LIMIT {limit * 2}
"""
logger.info(f"Executing SQL for Boss (limit={limit * 2}): {query[:500]}...")
result = await client.query(query)
if not result.result_rows:
return
# 在 Python 中过滤掉已存在的和待处理的
rows: List[Dict[str, Any]] = []
for cid, cname in result.result_rows:
if not cid or cid in exclude_cids:
continue
if len(rows) >= limit:
break
rows.append(
{
"source": "boss",
"company_id": cid,
"company_name": cname,
"status": "pending",
"created_at": datetime.now(),
"updated_at": datetime.now(),
}
)
await self._insert_pending(client, rows)
logger.info(f"Added {len(rows)} Boss companies to pending.")
async def _insert_pending(self, client, rows: List[Dict[str, Any]]):
if not rows:
return
data: List[List[Any]] = []
for r in rows:
data.append(
[
r["source"],
r["company_id"],
r["company_name"],
r["status"],
"",
r["created_at"],
r["updated_at"],
1,
]
)
await client.insert(
"job_data.pending_company",
data,
column_names=[
"source",
"company_id",
"company_name",
"status",
"error_msg",
"created_at",
"updated_at",
"version",
],
# 用 OFFSET 跳过已知公司数量,获取新公司
offset = len(exclude_ids)
result = await self._query_candidate_rows(
client=client,
table=config["job_table"],
company_id_expr=config["company_id_expr"],
company_name_expr=config["company_name_expr"],
days_back=config["days_back"],
limit=limit,
max_query_limit=config["max_query_limit"],
offset=offset,
)
if not result.result_rows:
logger.info(f"No new {source} companies found in job table query.")
return {
"source": source,
"query_count": 0,
"deduped_count": 0,
"existing_count": len(all_existing),
"queued_count": len(all_queued),
"created_count": 0,
}
deduped_candidates: list[dict[str, str]] = []
seen_ids: set[str] = set()
for raw_company_id, company_name in result.result_rows:
company_id = normalize_company_id(source, raw_company_id)
if not company_id or company_id in seen_ids or company_id in exclude_ids:
continue
seen_ids.add(company_id)
deduped_candidates.append(
{
"company_id": company_id,
"company_name": (company_name or "").strip(),
}
)
if len(deduped_candidates) >= limit:
break
created_count = await company_storage.enqueue_companies(source, deduped_candidates)
logger.info(f"Added {created_count} {source} companies to MySQL queue.")
return {
"source": source,
"query_count": len(result.result_rows),
"deduped_count": len(deduped_candidates),
"existing_count": len(all_existing),
"queued_count": len(all_queued),
"created_count": created_count,
}
async def _query_candidate_rows(
self,
*,
client,
table: str,
company_id_expr: str,
company_name_expr: str,
days_back: int,
limit: int,
max_query_limit: Optional[int],
offset: int = 0,
):
current_days = days_back
current_limit = limit * 5
if max_query_limit is not None:
current_limit = min(current_limit, max_query_limit)
last_error: Optional[Exception] = None
for attempt in range(3):
sample_sql = " SAMPLE 0.1" if attempt == 2 else ""
current_offset = offset
if attempt == 1:
current_days = max(1, min(current_days, 3))
current_limit = min(current_limit, max(limit, 50))
current_offset = 0
query = f"""
SELECT DISTINCT
{company_id_expr} AS cid,
{company_name_expr} AS cname
FROM job_data.{table}{sample_sql}
PREWHERE created_at > now() - INTERVAL {current_days} DAY
AND json_data != ''
WHERE {company_id_expr} != ''
LIMIT {current_limit} OFFSET {current_offset}
"""
try:
logger.info(
f"Querying company candidates from {table} "
f"(days={current_days}, limit={current_limit}, attempt={attempt + 1})"
)
return await client.query(query)
except Exception as exc:
last_error = exc
error_str = str(exc).lower()
if "memory" in error_str or "memory_limit" in error_str:
logger.warning(f"Memory-sensitive query retry for {table}: {exc}")
continue
raise
assert last_error is not None
raise last_error
async def process_single_company(
self,
@ -455,73 +210,47 @@ class CompanyCleaner:
company_id: str,
proxy: Optional[str] = None,
max_delay_seconds: int = 5,
) -> Dict[str, Any]:
client = await clickhouse_manager.get_client()
) -> dict[str, Any]:
normalized_id = normalize_company_id(source, company_id)
queue, _ = await company_storage.enqueue_company(source, normalized_id)
if proxy:
self._apply_proxy(proxy)
delay = 0
if max_delay_seconds and max_delay_seconds > 0:
delay = random.randint(1, max_delay_seconds)
if delay > 0:
await asyncio.sleep(delay)
query = f"""
SELECT source, company_id, company_name, version
FROM job_data.pending_company
FINAL
WHERE source = '{source}' AND company_id = '{company_id}'
ORDER BY version DESC
LIMIT 1
"""
result = await client.query(query)
if result.result_rows:
source_value, cid, cname, version = result.result_rows[0]
else:
source_value = source
cid = company_id
cname = ""
version = 0
await asyncio.sleep(random.randint(1, max_delay_seconds))
await company_storage.mark_queue_processing(queue)
try:
success = await self._fetch_and_save(source_value, cid)
status = "done" if success else "failed"
error_msg = "" if success else "Fetch failed"
except Exception as e:
logger.error(f"Error processing {source_value} {cid}: {e}")
status = "failed"
error_msg = str(e)
await client.insert(
"job_data.pending_company",
[
[
source_value,
cid,
cname,
status,
error_msg.replace("'", "''"),
datetime.now(),
datetime.now(),
int(version) + 1,
]
],
column_names=[
"source",
"company_id",
"company_name",
"status",
"error_msg",
"created_at",
"updated_at",
"version",
],
)
return {
"success": status == "done",
"source": source_value,
"company_id": cid,
"company_name": cname,
"status": status,
"error_msg": error_msg,
"version": int(version) + 1,
}
persist_result = await self._fetch_and_save(source, normalized_id)
jobs_result = await self._sync_company_jobs(source, normalized_id)
if persist_result["company_name"] and queue.company_name != persist_result["company_name"]:
queue.company_name = persist_result["company_name"]
await company_storage.mark_queue_result(queue, status="done", jobs_summary=jobs_result)
return {
"success": True,
"source": source,
"company_id": normalized_id,
"company_name": persist_result["company_name"],
"status": "done",
"error_msg": "",
"created": persist_result["created"],
"jobs_summary": jobs_result,
}
except Exception as exc:
logger.error(f"Error processing {source} {normalized_id}: {exc}")
await company_storage.mark_queue_result(
queue,
status="failed",
error_msg=str(exc),
increment_retry=True,
)
return {
"success": False,
"source": source,
"company_id": normalized_id,
"company_name": queue.company_name or "",
"status": "failed",
"error_msg": str(exc),
}
async def process_pending_companies(
self,
@ -530,116 +259,81 @@ class CompanyCleaner:
proxy: Optional[str] = None,
max_delay_seconds: int = 0,
):
client = await clickhouse_manager.get_client()
logger.info(f"Processing pending companies (limit={limit}, source={source or 'all'})...")
if proxy:
self._apply_proxy(proxy)
where_clause = "WHERE status = 'pending'"
query = CompanyCleaningQueue.filter(status="pending")
if source:
where_clause += f" AND source = '{source}'"
query = f"""
SELECT source, company_id, company_name, version
FROM job_data.pending_company
FINAL
{where_clause}
ORDER BY created_at ASC
LIMIT {limit}
"""
result = await client.query(query)
if not result.result_rows:
query = query.filter(source=source)
queue_rows = await query.order_by("created_at").limit(limit)
if not queue_rows:
logger.info("No pending companies to process.")
return
for source_value, cid, cname, version in result.result_rows:
logger.info(f"Processing {source_value} company: {cname} ({cid})")
for queue in queue_rows:
logger.info(f"Processing {queue.source} company: {queue.company_name or ''} ({queue.company_id})")
await company_storage.mark_queue_processing(queue)
try:
if max_delay_seconds and max_delay_seconds > 0:
delay = random.randint(1, max_delay_seconds)
if delay > 0:
await asyncio.sleep(delay)
success = await self._fetch_and_save(source_value, cid)
status = "done" if success else "failed"
error_msg = "" if success else "Fetch failed"
except Exception as e:
logger.error(f"Error processing {source_value} {cid}: {e}")
status = "failed"
error_msg = str(e)
await client.insert(
"job_data.pending_company",
[
[
source_value,
cid,
cname,
status,
error_msg.replace("'", "''"),
datetime.now(),
datetime.now(),
int(version) + 1,
]
],
column_names=[
"source",
"company_id",
"company_name",
"status",
"error_msg",
"created_at",
"updated_at",
"version",
],
)
await asyncio.sleep(random.randint(1, max_delay_seconds))
persist_result = await self._fetch_and_save(queue.source, queue.company_id)
jobs_result = await self._sync_company_jobs(queue.source, queue.company_id)
logger.info(
f"Synced {queue.source} company jobs: "
f"fetched={jobs_result['jobs_fetched']} stored={jobs_result['stored_success']} "
f"duplicate={jobs_result['duplicate']} failed={jobs_result['failed']}"
)
if persist_result["company_name"] and queue.company_name != persist_result["company_name"]:
queue.company_name = persist_result["company_name"]
await company_storage.mark_queue_result(queue, status="done", jobs_summary=jobs_result)
except Exception as exc:
logger.error(f"Error processing {queue.source} {queue.company_id}: {exc}")
await company_storage.mark_queue_result(
queue,
status="failed",
error_msg=str(exc),
increment_retry=True,
)
async def _fetch_and_save(self, source: str, company_id: str) -> bool:
data: Optional[Dict[str, Any]] = None
target_table = ""
if source == "zhilian":
data = self.zhilian_service.get_company_detail(company_id)
target_table = "zhilian_company"
elif source == "qcwy":
data = self.qcwy_service.get_company_info(company_id)
target_table = "qcwy_company"
elif source == "boss":
await self._ensure_boss_token_loaded()
data = self.boss_service.get_company_detail_by_id(company_id)
target_table = "boss_company"
async def _fetch_and_save(self, source: str, company_id: str) -> dict[str, Any]:
data = await self._fetch_company_data(source, company_id)
if not data:
logger.error(f"No data returned from source={source} company_id={company_id}")
return False
try:
logger.info(
f"Raw company data from source={source} company_id={company_id}: "
f"{json.dumps(data, ensure_ascii=False)[:2000]}"
)
except Exception as e:
logger.error(f"Failed to log raw company data for source={source} company_id={company_id}: {e}")
client = await clickhouse_manager.get_client()
name = ""
if source == "zhilian":
name = data.get("companyBase", {}).get("companyName", "")
elif source == "qcwy":
name = data.get("companyName", "")
elif source == "boss":
name = data.get("name", "")
json_str = json.dumps(data, ensure_ascii=False)
await client.insert(
f"job_data.{target_table}",
[[0, json_str, name, datetime.now(), datetime.now()]],
column_names=["id", "json_data", "company_name", "created_at", "updated_at"],
)
return True
raise ValueError(f"No data returned from source={source} company_id={company_id}")
return await company_storage.upsert_company(source, data, company_id=company_id)
async def _sync_company_jobs(self, source: str, company_id: str) -> dict[str, Any]:
try:
return await self.company_jobs_sync.sync_company_jobs(source, company_id)
except Exception as exc:
logger.warning(f"Sync company jobs failed for {source} {company_id}: {exc}")
return {
"success": False,
"source": source,
"company_id": company_id,
"jobs_fetched": 0,
"stored_success": 0,
"duplicate": 0,
"failed": 0,
"error": str(exc),
}
async def _fetch_company_data(self, source: str, company_id: str) -> dict[str, Any]:
if source == "zhilian":
data = await asyncio.to_thread(self.zhilian_service.get_company_detail, company_id)
return data or {}
if source == "qcwy":
data = await asyncio.to_thread(self.qcwy_service.get_company_info, company_id)
return data or {}
if source == "boss":
await self._ensure_boss_token_loaded()
data = await asyncio.to_thread(self.boss_service.get_company_detail_by_id, company_id)
return data or {}
raise ValueError(f"unsupported source: {source}")
async def cleanup_old_records(self):
    """Remove finished (done or failed) records; meant to be called daily.

    Issues a delete mutation against ``job_data.pending_company`` in
    ClickHouse (errors are logged, not raised), then deletes the done/failed
    rows of ``CompanyCleaningQueue``.
    """
    ch_client = await clickhouse_manager.get_client()
    logger.info("Starting cleanup of processed pending companies...")
    try:
        # ClickHouse mutations are async, but lightweight for this purpose
        await ch_client.command(
            "ALTER TABLE job_data.pending_company DELETE WHERE status IN ('done', 'failed')"
        )
        logger.info("Cleanup command executed successfully.")
    except Exception as exc:
        logger.error(f"Cleanup failed: {exc}")

    finished = CompanyCleaningQueue.filter(status__in=["done", "failed"])
    await finished.delete()
company_cleaner = CompanyCleaner()

View File

@ -0,0 +1,355 @@
from __future__ import annotations
from datetime import datetime
from typing import Any, Dict, Iterable, Optional, Type
from app.models.company import (
BaseCompanyModel,
BossCompany,
CompanyCleaningQueue,
QcwyCompany,
ZhilianCompany,
)
# Platform identifiers handled by this module (same keys as the model mapping
# in _model_for_source).
COMPANY_SOURCES = {"boss", "qcwy", "zhilian"}
# Queue statuses written by mark_queue_result for finished entries —
# presumably treated as "no further processing"; confirm against callers.
QUEUE_TERMINAL_STATUSES = {"done", "failed"}
def normalize_company_id(source: str, company_id: str) -> str:
    """Return ``company_id`` as a stripped string in its canonical form.

    Falsy ids become ``""``. For ``source == "qcwy"`` a case-insensitive
    ``co`` prefix followed only by digits is dropped (``"co123"`` -> ``"123"``).
    All other values are returned stripped but otherwise unchanged.
    """
    cleaned = str(company_id or "").strip()
    if source != "qcwy":
        return cleaned

    digits = cleaned[2:]
    has_co_prefix = cleaned.lower().startswith("co")
    if has_co_prefix and digits.isdigit():
        return digits
    return cleaned
def _pick_first(data: dict[str, Any], *keys: str) -> Optional[Any]:
for key in keys:
value = data.get(key)
if value not in (None, ""):
return value
return None
def _nested_get(data: dict[str, Any], *path: str) -> Any:
current: Any = data
for key in path:
if not isinstance(current, dict):
return None
current = current.get(key)
return current
def _clean_text(value: Any) -> Optional[str]:
if value is None:
return None
text = str(value).strip()
return text or None
def _model_for_source(source: str) -> Type[BaseCompanyModel]:
    """Return the company model class registered for ``source``.

    Raises:
        ValueError: If ``source`` is not boss / qcwy / zhilian.
    """
    models: dict[str, Type[BaseCompanyModel]] = {
        "boss": BossCompany,
        "qcwy": QcwyCompany,
        "zhilian": ZhilianCompany,
    }
    model = models.get(source)
    if model is None:
        raise ValueError(f"unsupported source: {source}")
    return model
def _extract_boss_fields(raw: dict[str, Any], company_id: str) -> dict[str, Any]:
    """Map a BOSS company payload onto the shared company columns.

    Reads from ``raw["zpData"]`` when that is a dict, otherwise from ``raw``
    itself, using the ``brandComInfoVO`` and ``companyFullInfoVO`` sub-objects.
    ``company_name`` falls back to ``""``; every other text field falls back
    to ``None``.
    """
    zp_data = raw.get("zpData")
    payload = zp_data if isinstance(zp_data, dict) else raw
    brand = payload.get("brandComInfoVO") or {}
    company_full = payload.get("companyFullInfoVO") or {}

    # An explicit company_id wins over ids embedded in the payload.
    source_id = company_id or _pick_first(brand, "encryptBrandId", "brandId")
    name = (
        _pick_first(payload, "name")
        or _pick_first(company_full, "name", "brandName")
        or _pick_first(brand, "brandName")
    )
    company_type = _pick_first(company_full, "typeName") or _pick_first(brand, "brandIndustry")
    industry = _pick_first(brand, "industryName") or _pick_first(company_full, "industry")
    size = _pick_first(brand, "scaleName") or _pick_first(company_full, "scaleName")
    stage = _pick_first(brand, "stageName") or _pick_first(company_full, "stageName")
    city = _pick_first(company_full, "cityName", "city")
    address = _pick_first(company_full, "address", "addressInfo")
    website = _pick_first(company_full, "website")
    logo = _pick_first(company_full, "logo", "brandLogo") or _pick_first(brand, "logo", "brandLogo")
    description = (
        _pick_first(company_full, "introduce", "introduction", "companyDesc")
        or _pick_first(brand, "introduce")
    )

    return {
        "source_company_id": normalize_company_id("boss", source_id),
        "company_name": _clean_text(name) or "",
        "company_type": _clean_text(company_type),
        "industry": _clean_text(industry),
        "company_size": _clean_text(size),
        "financing_stage": _clean_text(stage),
        "city": _clean_text(city),
        "address": _clean_text(address),
        "website": _clean_text(website),
        "logo_url": _clean_text(logo),
        "description": _clean_text(description),
    }
def _extract_qcwy_fields(raw: dict[str, Any], company_id: str) -> dict[str, Any]:
    """Map a QCWY company payload onto the shared company columns.

    Top-level keys of ``raw`` are preferred; the ``coinfo`` sub-object (used
    only when it is a dict) supplies fallbacks. ``company_name`` falls back to
    ``""``; every other text field falls back to ``None``.
    """
    financing = raw.get("financingStage") or {}
    raw_coinfo = raw.get("coinfo")
    coinfo = raw_coinfo if isinstance(raw_coinfo, dict) else {}

    # An explicit company_id wins over ids embedded in the payload.
    source_id = (
        company_id
        or _pick_first(raw, "companyId", "coId")
        or _nested_get(raw, "coinfo", "coid")
    )
    name = (
        _pick_first(raw, "companyName", "fullCompanyName", "companyNameEn")
        or _pick_first(coinfo, "coname", "brandName")
    )
    company_type = _pick_first(raw, "companyTypeString", "orgTypeName") or _pick_first(coinfo, "cotype")
    industry = (
        _pick_first(raw, "industryName", "companyIndustryType1Str")
        or _pick_first(coinfo, "indtype1", "indtype2", "coIndustryText")
    )
    size = (
        _pick_first(raw, "companySizeString", "companySize", "orgSizeName")
        or _pick_first(coinfo, "cosize")
    )
    stage = _pick_first(financing, "name") or _pick_first(raw, "financingStageName")
    city = _pick_first(raw, "cityName", "jobAreaString", "workCity") or _pick_first(coinfo, "areaString")
    address = (
        _pick_first(raw, "address", "location")
        or _nested_get(raw, "workLocation", "workAddress")
        or _pick_first(coinfo, "caddr")
    )
    website = _pick_first(raw, "companyUrl", "companyHref") or _pick_first(coinfo, "webUrl")
    logo = _pick_first(raw, "companyLogo") or _pick_first(coinfo, "logourl")
    description = (
        _pick_first(raw, "companyDesc", "company_desc", "description")
        or _nested_get(raw, "campusRootOrgInfo", "description")
        or _pick_first(coinfo, "coinfo")
    )

    return {
        "source_company_id": normalize_company_id("qcwy", source_id),
        "company_name": _clean_text(name) or "",
        "company_type": _clean_text(company_type),
        "industry": _clean_text(industry),
        "company_size": _clean_text(size),
        "financing_stage": _clean_text(stage),
        "city": _clean_text(city),
        "address": _clean_text(address),
        "website": _clean_text(website),
        "logo_url": _clean_text(logo),
        "description": _clean_text(description),
    }
def _extract_zhilian_fields(raw: dict[str, Any], company_id: str) -> dict[str, Any]:
    """Map a Zhilian company payload onto the shared company columns.

    Reads from ``raw["data"]`` when that is a dict, otherwise from ``raw``
    itself, preferring ``companyBase`` and falling back to ``detailedCompany``.
    ``company_name`` falls back to ``""``; every other text field falls back
    to ``None``.
    """
    inner = raw.get("data")
    data = inner if isinstance(inner, dict) else raw
    company_base = data.get("companyBase") or {}
    detailed_company = data.get("detailedCompany") or {}

    # An explicit company_id wins over ids embedded in the payload.
    source_id = (
        company_id
        or _pick_first(company_base, "companyNumber", "number")
        or _pick_first(detailed_company, "companyNumber", "number")
    )
    name = _pick_first(company_base, "companyName") or _pick_first(data, "companyName")
    company_type = (
        _pick_first(company_base, "companyTypeName", "companyType")
        or _pick_first(detailed_company, "companyTypeName")
    )
    industry = _pick_first(company_base, "industryName") or _pick_first(detailed_company, "industryName")
    size = (
        _pick_first(company_base, "companySize", "companySizeString")
        or _pick_first(detailed_company, "companySize")
    )
    stage = (
        _pick_first(company_base, "financingStageName")
        or _nested_get(company_base, "financingStage", "name")
        or _nested_get(detailed_company, "financingStage", "name")
    )
    city = _pick_first(company_base, "cityName") or _pick_first(detailed_company, "cityName")
    address = _pick_first(company_base, "address") or _pick_first(detailed_company, "address")
    website = _pick_first(company_base, "companyUrl", "website")
    logo = _pick_first(company_base, "logoUrl", "companyLogo")
    description = (
        _pick_first(company_base, "companyDescWithHtml", "companyDesc")
        or _pick_first(detailed_company, "companyDescription", "companyDesc")
    )

    return {
        "source_company_id": normalize_company_id("zhilian", source_id),
        "company_name": _clean_text(name) or "",
        "company_type": _clean_text(company_type),
        "industry": _clean_text(industry),
        "company_size": _clean_text(size),
        "financing_stage": _clean_text(stage),
        "city": _clean_text(city),
        "address": _clean_text(address),
        "website": _clean_text(website),
        "logo_url": _clean_text(logo),
        "description": _clean_text(description),
    }
def extract_company_fields(source: str, raw: dict[str, Any], company_id: str) -> dict[str, Any]:
    """Dispatch to the platform-specific field extractor for ``source``.

    Raises:
        ValueError: If ``source`` is not boss / qcwy / zhilian.
    """
    if source == "boss":
        extractor = _extract_boss_fields
    elif source == "qcwy":
        extractor = _extract_qcwy_fields
    elif source == "zhilian":
        extractor = _extract_zhilian_fields
    else:
        raise ValueError(f"unsupported source: {source}")
    return extractor(raw, company_id)
class CompanyStorageService:
    """Persistence helpers for company records and the company cleaning queue."""

    @staticmethod
    def company_model(source: str) -> Type[BaseCompanyModel]:
        """Return the company model class for ``source``."""
        return _model_for_source(source)

    async def get_existing_company_ids(self, source: str, company_ids: Iterable[str]) -> set[str]:
        """Return the normalized ids from ``company_ids`` that are already stored."""
        wanted = [normalize_company_id(source, raw_id) for raw_id in company_ids if raw_id]
        if not wanted:
            return set()
        stored = self.company_model(source).filter(source_company_id__in=wanted)
        return set(await stored.values_list("source_company_id", flat=True))

    async def get_all_company_ids(self, source: str) -> set[str]:
        """Return every stored company id for the platform (used to exclude them from ClickHouse queries)."""
        everything = self.company_model(source).all()
        return set(await everything.values_list("source_company_id", flat=True))

    async def get_existing_queue_ids(self, source: str, company_ids: Iterable[str]) -> set[str]:
        """Return the normalized ids from ``company_ids`` that already have a queue row."""
        wanted = [normalize_company_id(source, raw_id) for raw_id in company_ids if raw_id]
        if not wanted:
            return set()
        queued = CompanyCleaningQueue.filter(source=source, company_id__in=wanted)
        return set(await queued.values_list("company_id", flat=True))

    async def enqueue_company(self, source: str, company_id: str, company_name: str = "") -> tuple[CompanyCleaningQueue, bool]:
        """Get or create the queue row for a company.

        A new row starts as ``pending`` with zeroed counters. For an existing
        row only ``company_name`` is refreshed, and only when a different,
        non-empty name is supplied.

        Returns:
            ``(queue_row, created)``.
        """
        job_counters = ("jobs_fetched", "jobs_stored", "jobs_duplicate", "jobs_failed")
        queue, created = await CompanyCleaningQueue.get_or_create(
            source=source,
            company_id=normalize_company_id(source, company_id),
            defaults={
                "company_name": company_name or "",
                "status": "pending",
                "error_msg": "",
                "retry_count": 0,
                "started_at": None,
                "finished_at": None,
                **dict.fromkeys(job_counters, 0),
                "jobs_error_msg": "",
            },
        )
        if created or not company_name or queue.company_name == company_name:
            return queue, created

        queue.company_name = company_name
        await queue.save(update_fields=["company_name", "updated_at"])
        return queue, created

    async def enqueue_companies(self, source: str, companies: Iterable[dict[str, str]]) -> int:
        """Enqueue each company dict in order and return how many rows were newly created."""
        outcomes = [
            await self.enqueue_company(
                source=source,
                company_id=entry.get("company_id", ""),
                company_name=entry.get("company_name", "") or "",
            )
            for entry in companies
        ]
        return sum(1 for _, created in outcomes if created)

    async def get_company_record(self, source: str, company_id: str) -> Optional[BaseCompanyModel]:
        """Return the stored record for the normalized ``company_id``, or ``None``."""
        return await self.company_model(source).get_or_none(
            source_company_id=normalize_company_id(source, company_id)
        )

    async def upsert_company(
        self,
        source: str,
        raw_data: dict[str, Any],
        *,
        company_id: Optional[str] = None,
    ) -> dict[str, Any]:
        """Create or update the company record extracted from ``raw_data``.

        The extracted fields, the raw payload (``raw_json``) and
        ``last_crawled_at`` are written; ``first_crawled_at`` is set only on
        creation.

        Returns:
            A dict with ``success``, ``created``, ``company_id``,
            ``company_name``, ``data_summary`` and the model ``record``.

        Raises:
            ValueError: If no company id or no company name can be determined
                (or the source is unsupported).
        """
        fields = extract_company_fields(
            source, raw_data, normalize_company_id(source, company_id or "")
        )
        normalized_id = fields["source_company_id"]
        if not normalized_id:
            raise ValueError(f"missing normalized company id for source={source}")
        if not fields["company_name"]:
            raise ValueError(f"missing company name for source={source} company_id={normalized_id}")

        model = self.company_model(source)
        record = await model.get_or_none(source_company_id=normalized_id)
        now = datetime.now()
        payload = dict(fields, raw_json=raw_data, last_crawled_at=now)

        created = not record
        if created:
            record = await model.create(**payload, first_crawled_at=now)
        else:
            for attr, new_value in payload.items():
                setattr(record, attr, new_value)
            await record.save()

        summary = {
            "source": source,
            "company_id": normalized_id,
            "company_name": record.company_name,
            "created": created,
        }
        return {
            "success": True,
            "created": created,
            "company_id": normalized_id,
            "company_name": record.company_name,
            "data_summary": summary,
            "record": record,
        }

    async def mark_queue_processing(self, queue: CompanyCleaningQueue) -> None:
        """Flag ``queue`` as processing and reset its error and job counters."""
        resets = {
            "status": "processing",
            "error_msg": "",
            "started_at": datetime.now(),
            "finished_at": None,
            "jobs_fetched": 0,
            "jobs_stored": 0,
            "jobs_duplicate": 0,
            "jobs_failed": 0,
            "jobs_error_msg": "",
        }
        for attr, new_value in resets.items():
            setattr(queue, attr, new_value)
        await queue.save(update_fields=[*resets, "updated_at"])

    async def mark_queue_result(
        self,
        queue: CompanyCleaningQueue,
        *,
        status: str,
        error_msg: str = "",
        increment_retry: bool = False,
        jobs_summary: Optional[dict[str, Any]] = None,
    ) -> None:
        """Record the outcome of processing ``queue`` and save it.

        Args:
            queue: Queue row to update.
            status: New status value.
            error_msg: Error text; falsy values are stored as ``""``.
            increment_retry: When true, ``retry_count`` is increased by one.
            jobs_summary: Optional job-sync summary; when truthy its
                ``jobs_fetched`` / ``stored_success`` / ``duplicate`` /
                ``failed`` / ``error`` entries are copied onto the row.
        """
        queue.status = status
        queue.error_msg = error_msg or ""
        queue.finished_at = datetime.now()

        if jobs_summary:
            # (queue attribute, summary key) pairs for the integer counters.
            counter_map = (
                ("jobs_fetched", "jobs_fetched"),
                ("jobs_stored", "stored_success"),
                ("jobs_duplicate", "duplicate"),
                ("jobs_failed", "failed"),
            )
            for attr, summary_key in counter_map:
                setattr(queue, attr, int(jobs_summary.get(summary_key) or 0))
            queue.jobs_error_msg = jobs_summary.get("error") or ""

        if increment_retry:
            queue.retry_count = queue.retry_count + 1

        # company_name is included because callers may have changed it on the
        # row before reporting the result.
        await queue.save(
            update_fields=[
                "company_name",
                "status",
                "error_msg",
                "retry_count",
                "finished_at",
                "jobs_fetched",
                "jobs_stored",
                "jobs_duplicate",
                "jobs_failed",
                "jobs_error_msg",
                "updated_at",
            ]
        )
company_storage = CompanyStorageService()

View File

@ -1,3 +1,3 @@
from .boss import BossService
from .qcwy import QcwyService
from .zhilian import ZhilianService
from .boss import BossService as BossService
from .qcwy import QcwyService as QcwyService
from .zhilian import ZhilianService as ZhilianService

View File

@ -12,10 +12,33 @@ from __future__ import annotations
from typing import Any, Optional
from crawler_core.base import BaseFetcher, BaseSearcher
from crawler_core.base import BaseFetcher, BaseSearcher, Result
from app.services.crawler._zhilian_client import ZhilianClient, create_cgate_client, create_capi_client
def _parse_zhilian_response(http_code: int, raw) -> Result:
    """Parse a raw Zhilian response into a ``Result``.

    Args:
        http_code: HTTP status code of the response.
        raw: Decoded response body; expected to be a dict.

    Returns:
        A failed ``Result`` when ``http_code`` is not 200 or ``raw`` is not a
        dict. Otherwise a successful ``Result`` carrying ``raw["data"]``; when
        that payload is a dict with a ``"list"`` key the result also carries
        the items, the total count and ``is_end_page`` (true for an empty
        page).
    """
    if http_code != 200:
        return Result(success=False, status_code=http_code,
                      error=f"HTTP 请求失败: {http_code}")
    if not isinstance(raw, dict):
        return Result(success=False, status_code=http_code, error="响应格式异常")

    payload = raw.get("data") or {}

    # List-style response
    if isinstance(payload, dict) and "list" in payload:
        # JSON null for "list" / "pageInfo" is treated like a missing value so
        # an empty page ends pagination instead of raising.
        items = payload.get("list") or []
        page_info = raw.get("pageInfo")
        if not isinstance(page_info, dict):
            page_info = {}
        num_found = page_info.get("numFound", 0) or payload.get("numFound", len(items))
        return Result(
            success=True, status_code=200, data=payload,
            list=items, count=num_found,
            is_end_page=len(items) == 0,
        )

    return Result(success=True, status_code=200, data=payload)
_SEARCH_BODY = {
"eventScenario": "wxmpZhaopinSearchV2",
"filterMinSalary": 1,
@ -49,6 +72,9 @@ class SearchPositions(BaseSearcher):
self.collected_purpose = collected_purpose
self.filters = filters or {}
def _parse(self, http_code: int, raw) -> Result:
return _parse_zhilian_response(http_code, raw)
def _build_params(self, page_index: int) -> dict:
body = {**_SEARCH_BODY, "pageIndex": page_index, "pageSize": self.page_size}
if self.collected_purpose:
@ -90,6 +116,9 @@ class GetPositionDetail(BaseFetcher):
def _build_params(self) -> dict:
return {"number": self.number, "identity": self.identity, "resumeNumber": ""}
def _parse(self, http_code: int, raw) -> Result:
return _parse_zhilian_response(http_code, raw)
class GetCompanyExtDetail(BaseFetcher):
ENDPOINT = "/riskstorm/company/getCompanyExtDetail"
@ -102,6 +131,9 @@ class GetCompanyExtDetail(BaseFetcher):
def _build_params(self) -> dict:
return {"companyName": self.company_name, "companyNumber": self.company_number}
def _parse(self, http_code: int, raw) -> Result:
return _parse_zhilian_response(http_code, raw)
class GetCompanyDetail(BaseFetcher):
ENDPOINT = "/positionbusiness/exposure/companyDetail"
@ -113,6 +145,9 @@ class GetCompanyDetail(BaseFetcher):
def _build_params(self) -> dict:
return {"number": self.number}
def _parse(self, http_code: int, raw) -> Result:
return _parse_zhilian_response(http_code, raw)
class SearchCompanyPositions(BaseSearcher):
ENDPOINT = "/capi/searchrecommend/searchPositionsCompany"
@ -146,3 +181,6 @@ class SearchCompanyPositions(BaseSearcher):
def _request(self, params: dict) -> tuple[int, Any]:
return self.http_client.get(self.ENDPOINT, params)
def _parse(self, http_code: int, raw) -> Result:
return _parse_zhilian_response(http_code, raw)

View File

@ -0,0 +1,5 @@
# 触发所有平台配置注册
from app.services.ingest.configs import * # noqa: F401, F403
from app.services.ingest.service import IngestService
__all__ = ["IngestService"]

View File

@ -0,0 +1,4 @@
# 导入各平台配置,触发注册
from app.services.ingest.configs import boss # noqa: F401
from app.services.ingest.configs import qcwy # noqa: F401
from app.services.ingest.configs import zhilian # noqa: F401

View File

@ -0,0 +1,53 @@
from dataclasses import dataclass, field
from typing import Callable, Dict, Any, List, Optional, Tuple
from app.log import logger
@dataclass(frozen=True)
class DedupFieldSpec:
    """Dedup field spec: how to derive one dedup column value from raw data."""

    # Name of the dedup column.
    column: str
    # Callable applied to the raw record; returns the column value or None.
    extractor: Callable[[Dict[str, Any]], Optional[str]]
@dataclass(frozen=True)
class PlatformConfig:
    """Platform configuration (immutable)."""

    platform: str
    channel: str
    data_type: str
    table: str
    # Specs describing the dedup columns; empty by default.
    dedup_fields: Tuple[DedupFieldSpec, ...] = ()
    # Optional callable turning a raw record into a push payload (or None).
    push_mapper: Optional[Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]] = None

    @property
    def key(self) -> Tuple[str, str, str]:
        """Registry key: ``(platform, channel, data_type)``."""
        return self.platform, self.channel, self.data_type

    @property
    def dedup_columns(self) -> List[str]:
        """Column names of ``dedup_fields``, in declaration order."""
        return [spec.column for spec in self.dedup_fields]
# Global registry: maps a (platform, channel, data_type) key to its PlatformConfig.
_REGISTRY: Dict[Tuple[str, str, str], PlatformConfig] = {}
def register(config: PlatformConfig) -> None:
    """Store ``config`` in the registry under ``config.key``.

    An existing entry with the same key is replaced after logging a warning.
    """
    registry_key = config.key
    already_present = registry_key in _REGISTRY
    if already_present:
        logger.warning(f"覆盖已有注册: {registry_key}")
    _REGISTRY[registry_key] = config
def get_config(platform: str, channel: str, data_type: str) -> PlatformConfig:
    """Look up the registered config for the given triple.

    Raises:
        ValueError: If nothing is registered under that key.
    """
    found = _REGISTRY.get((platform, channel, data_type))
    if found is not None:
        return found
    raise ValueError(f"未注册的平台配置: platform={platform}, channel={channel}, data_type={data_type}")
def list_configs() -> List[PlatformConfig]:
    """Return a new list holding every registered config."""
    return [*_REGISTRY.values()]

View File

@ -0,0 +1,83 @@
import hashlib
import time
from typing import Dict, Any, List, Optional
import httpx
from app.log import logger
# Synchronous helper functions (no await, pure computation)
def safe_get(obj: Optional[Dict], key: str, default: str = "") -> str:
    """Return ``obj[key]`` as a string, or ``default`` when it is unavailable.

    ``default`` is returned when ``obj`` is ``None``, the key is missing, or
    the stored value is ``None``.
    """
    value = None if obj is None else obj.get(key)
    return default if value is None else str(value)
def safe_join(data, default: str = "") -> str:
    """Render ``data`` as text, joining lists with commas.

    Lists are joined from the string form of their truthy items (an empty
    list yields ``""``). Any other truthy value is passed through ``str``;
    falsy values, including ``None``, yield ``default``.
    """
    if isinstance(data, list):
        return ",".join(map(str, filter(None, data)))
    if data:
        return str(data)
    return default
# Module-level httpx client singleton; created lazily by get_http_client().
_http_client: Optional[httpx.AsyncClient] = None
def get_http_client() -> httpx.AsyncClient:
    """Return the shared ``httpx.AsyncClient``, creating it when needed.

    A new client (30 second timeout) is built when none exists yet or the
    current one has been closed.
    """
    global _http_client
    needs_new_client = _http_client is None or _http_client.is_closed
    if needs_new_client:
        _http_client = httpx.AsyncClient(timeout=30.0)
    return _http_client
async def close_http_client() -> None:
    """Close the shared client if it is open and clear the module reference.

    Does nothing when there is no client or it is already closed.
    """
    global _http_client
    if _http_client is None or _http_client.is_closed:
        return
    await _http_client.aclose()
    _http_client = None
def _build_auth_url() -> str:
from_id = 9910056
timestamp = int(time.time())
salt = "jWcIqJK6QlR2syb6HQgpel9iOoOkj01G5MDFNtQLaTxhddHUTEnURsMe2RxCTYC8"
token = hashlib.md5((salt + str(timestamp)).encode()).hexdigest()
return f"http://external-data.qixin.com/extend/extend_data_push?from={from_id}&token={token}&time={timestamp}"
# HTTP headers for the remote push request: JSON content type plus a
# browser-like User-Agent.
_PUSH_HEADERS = {
    "Content-Type": "application/json",
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
}
async def push_to_remote(data: Dict[str, Any], *, dry_run: bool = True) -> bool:
    """Report one record to the remote push endpoint.

    Delivery is off by default: with ``dry_run=True`` the record is only
    logged and nothing is sent. Pass ``dry_run=False`` to POST the record as
    JSON to the URL from ``_build_auth_url()`` using the shared HTTP client.

    Args:
        data: Record to report; ``source_type``, ``title`` and
            ``company_name`` / ``name`` are used for the log line.
        dry_run: When true (default) skip the HTTP request.

    Returns:
        ``True`` only when the endpoint answered with HTTP 200. ``False`` for
        a dry run, a non-200 answer, or any exception while sending.
    """
    source_type = data.get("source_type", "未知平台")
    title = data.get("title", "未知职位")
    company = data.get("company_name", data.get("name", "未知公司"))
    logger.info(f"上报数据: [{source_type}] {title} - {company}")

    if dry_run:
        # Nothing was delivered, so report a falsy result; the payload goes to
        # the debug log rather than stdout.
        logger.debug(f"push_to_remote dry run payload: {data}")
        return False

    try:
        url = _build_auth_url()
        client = get_http_client()
        response = await client.post(url, json=data, headers=_PUSH_HEADERS)
    except Exception as e:
        # Best-effort delivery: a failed push must not break the caller.
        logger.error(f"发送异常: {e}")
        return False

    if response.status_code == 200:
        return True
    logger.error(f"数据发送失败: {response.status_code} - {response.text[:100]}")
    return False
async def batch_push_to_remote(data_list: List[Dict[str, Any]]) -> None:
    """Push every record in ``data_list`` one after another.

    A failure on one record is logged and does not stop the remaining ones.
    """
    for record in data_list:
        try:
            await push_to_remote(record)
        except Exception as exc:
            logger.error(f"批量推送单条失败: {exc}")

View File

@ -1,108 +0,0 @@
import json
from datetime import datetime
from typing import Dict, Any, List, Optional, Tuple
from clickhouse_connect.driver import AsyncClient
class IngestService:
    """Stores crawled job/company payloads as JSON rows in ClickHouse.

    Every row carries the serialized payload plus the dedup columns of its
    platform/data type. Duplicate detection queries the target table by those
    columns before inserting.
    """

    def __init__(self, client: AsyncClient):
        self.client = client

    def _table_name(self, platform: str, data_type: str) -> str:
        """Return the fully qualified table name for a platform/data type."""
        return f"job_data.{platform}_{data_type}"

    @staticmethod
    def _as_text(value: Any) -> str:
        """Stringify a dedup value; ``None`` becomes ``""`` instead of ``"None"``."""
        return "" if value is None else str(value)

    @staticmethod
    def _job_base(data: Dict[str, Any]) -> Dict[str, Any]:
        """Return the BOSS ``jobBaseInfoVO`` object, or ``{}`` if absent or not a dict."""
        base = data.get("jobBaseInfoVO")
        return base if isinstance(base, dict) else {}

    @staticmethod
    def _company_name(platform: str, data: Dict[str, Any]) -> Optional[Any]:
        """Return the company name used as dedup key for ``platform`` (or ``None``)."""
        if platform == "boss":
            brand = data.get("brandComInfoVO")
            brand_name = brand.get("brandName") if isinstance(brand, dict) else None
            return brand_name or data.get("name")
        if platform == "qcwy":
            return data.get("fullCompanyName") or data.get("companyName")
        if platform == "zhilian":
            return data.get("companyName") or data.get("name")
        return None

    def _build_row(self, platform: str, data_type: str, data: Dict[str, Any]) -> Tuple[List[str], List[Any]]:
        """Build the column names and values of the row to insert for ``data``."""
        now = datetime.now()
        columns = ["id", "json_data", "created_at", "updated_at"]
        values = [0, json.dumps(data, ensure_ascii=False), now, now]

        if platform == "boss" and data_type == "job":
            columns += ["job_id"]
            values += [self._as_text(self._job_base(data).get("jobId"))]
        elif platform == "qcwy" and data_type == "job":
            columns += ["job_id", "update_date_time"]
            values += [self._as_text(data.get("jobId")), self._as_text(data.get("updateDateTime"))]
        elif platform == "zhilian" and data_type == "job":
            columns += ["number", "first_publish_time"]
            values += [self._as_text(data.get("number")), self._as_text(data.get("firstPublishTime"))]
        elif data_type == "company":
            columns += ["company_name"]
            values += [str(self._company_name(platform, data) or "")]
        return columns, values

    def _dup_conditions(self, platform: str, data_type: str, data: Dict[str, Any]) -> Optional[Tuple[str, List[Any]]]:
        """Return ``(where_sql, params)`` identifying duplicates of ``data``.

        ``None`` is returned when the record lacks the values needed for a
        duplicate check (or the platform/data type has no rule).
        """
        if platform == "boss" and data_type == "job":
            job_id = self._job_base(data).get("jobId")
            if not job_id:
                return None
            return "job_id = %s", [str(job_id)]
        if platform == "qcwy" and data_type == "job":
            job_id = data.get("jobId")
            update_dt = data.get("updateDateTime")
            if not job_id or not update_dt:
                return None
            return "job_id = %s AND update_date_time = %s", [str(job_id), str(update_dt)]
        if platform == "zhilian" and data_type == "job":
            number = data.get("number")
            fpt = data.get("firstPublishTime")
            if not number or not fpt:
                return None
            return "number = %s AND first_publish_time = %s", [str(number), str(fpt)]
        if data_type == "company":
            name = self._company_name(platform, data)
            if not name:
                return None
            return "company_name = %s", [str(name)]
        return None

    async def _exists(self, table: str, condition: Tuple[str, List[Any]]) -> bool:
        """Return whether ``table`` already holds a row matching ``condition``."""
        where_sql, params = condition
        result = await self.client.query(f"SELECT 1 FROM {table} WHERE {where_sql} LIMIT 1", params)
        return bool(result.result_rows)

    async def store_single(self, platform: str, data_type: str, data: Dict[str, Any], check_duplicate: bool = True) -> Dict[str, int]:
        """Insert one record unless it duplicates an existing row.

        Returns:
            ``{"inserted": 1, "ignored": 0}`` or ``{"inserted": 0, "ignored": 1}``.
        """
        table = self._table_name(platform, data_type)
        if check_duplicate:
            condition = self._dup_conditions(platform, data_type, data)
            if condition and await self._exists(table, condition):
                return {"inserted": 0, "ignored": 1}
        cols, vals = self._build_row(platform, data_type, data)
        await self.client.insert(table, [vals], column_names=cols)
        return {"inserted": 1, "ignored": 0}

    async def store_batch(self, platform: str, data_type: str, data_list: List[Dict[str, Any]], check_duplicate: bool = True) -> Dict[str, int]:
        """Insert a batch of records in one statement, skipping duplicates.

        With ``check_duplicate`` a record is ignored when it matches an
        existing row or an earlier record of the same batch.

        Returns:
            ``{"inserted": <rows written>, "ignored": <records skipped>}``.
        """
        if not data_list:
            return {"inserted": 0, "ignored": 0}

        table = self._table_name(platform, data_type)
        rows: List[List[Any]] = []
        columns: Optional[List[str]] = None
        ignored = 0
        # Dedup keys already accepted in this batch. The table cannot reveal
        # them yet because the insert only happens after the loop.
        seen: set = set()

        for record in data_list:
            if check_duplicate:
                condition = self._dup_conditions(platform, data_type, record)
                if condition:
                    dedup_key = tuple(condition[1])
                    if dedup_key in seen or await self._exists(table, condition):
                        ignored += 1
                        continue
                    seen.add(dedup_key)
            cols, vals = self._build_row(platform, data_type, record)
            columns = columns or cols
            rows.append(vals)

        if rows:
            await self.client.insert(table, rows, column_names=columns)
        return {"inserted": len(rows), "ignored": ignored}

View File

@ -1,971 +0,0 @@
import hashlib
import time
from typing import Dict, Any, Optional, List
from enum import Enum
import json
from datetime import datetime
import requests
from clickhouse_connect.driver import AsyncClient
from app.log import logger
from tenacity import retry, stop_after_attempt, wait_exponential
class DataType(str, Enum):
    """Data type enumeration."""

    JOB = "job"
    COMPANY = "company"
class PlatformType(str, Enum):
    """Platform type enumeration."""

    BOSS = "boss"
    QCWY = "qcwy"
    ZHILIAN = "zhilian"
class DataRouterService:
"""通用数据路由服务 - 根据数据类型和平台自动选择对应的表进行存储"""
def __init__(self, clickhouse_client: AsyncClient):
    # ClickHouse client used by this service's query and insert helpers.
    self.clickhouse_client = clickhouse_client
    # Platform-specific repository references were removed in favour of the generic data interface
# Helper for safely reading list data
async def safe_join(self, data, default=""):
    """Safely convert list data into a comma-separated string.

    Lists are joined from the string form of their truthy items. Any other
    truthy value is passed through ``str``; falsy values yield ``default``.
    """
    if isinstance(data, list):
        return ",".join(str(entry) for entry in data if entry)
    if data:
        return str(data)
    return default
# Helper for safely reading string data
async def safe_get(self, obj, key, default=""):
    """Safely read ``key`` from a dict as a string.

    ``default`` is returned when ``obj`` is falsy, the key is missing, or the
    stored value is ``None``.
    """
    if not obj:
        return default
    found = obj.get(key)
    return default if found is None else str(found)
async def store_data(self,
                     data: Dict[str, Any],
                     data_type: DataType,
                     platform: PlatformType,
                     check_duplicate: bool = True) -> Dict[str, Any]:
    """Generic storage entry point, using the JSON storage scheme.

    Args:
        data: Record to store.
        data_type: Data type (job/company).
        platform: Platform type (boss/qcwy/zhilian).
        check_duplicate: Whether to check for duplicate data.

    Returns:
        The storage result dict. Exceptions are not raised to the caller:
        they are logged and reported as a dict with ``success=False``.
    """
    try:
        outcome = await self._store_data_as_json(data, data_type, platform, check_duplicate)
    except Exception as e:
        reason = str(e)
        logger.error(f"{platform} {data_type} 数据存储失败: {reason}")
        return {
            "success": False,
            "message": f"数据存储失败: {reason}",
            "duplicate": False,
            "error": reason
        }
    return outcome
def _get_json_table_name(self, data_type: DataType, platform: PlatformType) -> str:
"""根据数据类型和平台获取对应的JSON表名"""
return f"{platform.value}_{data_type.value}"
async def _store_data_as_json(self, data: Dict[str, Any], data_type: DataType, platform: PlatformType,
                              check_duplicate: bool = True) -> Dict[str, Any]:
    """Store one record using the JSON storage scheme.

    Steps, in order: prepare and send the remote push payload, run the
    platform-specific duplicate check, then insert a row holding the
    serialized payload plus the platform's dedup columns.

    Returns:
        A result dict with ``success``, ``message``, ``duplicate``,
        ``table_name`` and ``storage_type``.

    Raises:
        Exception: Any error is logged and re-raised.
    """
    try:
        # Resolve the JSON table name
        json_table_name = self._get_json_table_name(data_type, platform)
        # NOTE(review): the remote push runs before the duplicate checks below,
        # so records later skipped as duplicates are still pushed — confirm
        # this is intended. `check_duplicate` is not consulted in this method.
        remote_data = await self._prepare_remote_push_data(data, data_type, platform)
        if remote_data:
            await self.send_to_remote_server(remote_data)
        # QCWY job duplicate check: jobId + updateDateTime
        if platform == PlatformType.QCWY and data_type == DataType.JOB:
            job_id = data.get('jobId')
            update_date_time = data.get('updateDateTime')
            if job_id and update_date_time:
                duplicate_record = await self._check_qcwy_duplicate(json_table_name, job_id, update_date_time)
                if duplicate_record:
                    logger.info(f"QCWY职位数据重复跳过插入: jobId={job_id}, updateDateTime={update_date_time}")
                    return {
                        "success": True,
                        "message": "数据重复,跳过插入",
                        "duplicate": True,
                        "table_name": json_table_name,
                        "storage_type": "json"
                    }
        # BOSS job duplicate check: jobId
        if platform == PlatformType.BOSS and data_type == DataType.JOB:
            job_base_info = data.get('jobBaseInfoVO', {})
            job_id = job_base_info.get('jobId')
            if job_id:
                duplicate_record = await self._check_boss_duplicate(json_table_name, job_id)
                if duplicate_record:
                    logger.info(f"BOSS职位数据重复跳过插入: jobId={job_id}")
                    return {
                        "success": True,
                        "message": "数据重复,跳过插入",
                        "duplicate": True,
                        "table_name": json_table_name,
                        "storage_type": "json"
                    }
        # Zhilian job duplicate check: number + firstPublishTime
        if platform == PlatformType.ZHILIAN and data_type == DataType.JOB:
            number = data.get('number')
            first_publish_time = data.get('firstPublishTime')
            if number and first_publish_time:
                duplicate_record = await self._check_zhilian_duplicate(json_table_name, number, first_publish_time)
                if duplicate_record:
                    logger.info(
                        f"智联职位数据重复,跳过插入: number={number}, firstPublishTime={first_publish_time}")
                    return {
                        "success": True,
                        "message": "数据重复,跳过插入",
                        "duplicate": True,
                        "table_name": json_table_name,
                        "storage_type": "json"
                    }
        # BOSS company duplicate check: by company name
        if platform == PlatformType.BOSS and data_type == DataType.COMPANY:
            company_name = data.get('name') or data.get('companyFullInfoVO', {}).get('name')
            if company_name:
                duplicate_record = await self._check_boss_company_duplicate(json_table_name, company_name)
                if duplicate_record:
                    logger.info(f"BOSS公司数据重复跳过插入: companyName={company_name}")
                    return {
                        "success": True,
                        "message": "数据重复,跳过插入",
                        "duplicate": True,
                        "table_name": json_table_name,
                        "storage_type": "json"
                    }
        # QCWY company duplicate check: by company name
        if platform == PlatformType.QCWY and data_type == DataType.COMPANY:
            company_name = data.get('companyName') or data.get('company_name')
            if company_name:
                duplicate_record = await self._check_qcwy_company_duplicate(json_table_name, company_name)
                if duplicate_record:
                    logger.info(f"QCWY公司数据重复跳过插入: companyName={company_name}")
                    return {
                        "success": True,
                        "message": "数据重复,跳过插入",
                        "duplicate": True,
                        "table_name": json_table_name,
                        "storage_type": "json"
                    }
        # Zhilian company duplicate check: companyName
        if platform == PlatformType.ZHILIAN and data_type == DataType.COMPANY:
            company_name = data.get('companyName') or data.get('name')
            if company_name:
                duplicate_record = await self._check_zhilian_company_duplicate(json_table_name, company_name)
                if duplicate_record:
                    logger.info(f"智联公司数据重复,跳过插入: companyName={company_name}")
                    return {
                        "success": True,
                        "message": "数据重复,跳过插入",
                        "duplicate": True,
                        "table_name": json_table_name,
                        "storage_type": "json"
                    }
        # Prepare the row for JSON storage
        current_time = datetime.now()
        json_data = {
            'id': 0,  # placeholder; presumably auto-generated on insert (original comment was garbled)
            'json_data': json.dumps(data, ensure_ascii=False),
            'created_at': current_time,
            'updated_at': current_time
        }
        # Add the dedup columns according to platform and data type
        if platform == PlatformType.BOSS and data_type == DataType.JOB:
            # BOSS job data: add the job_id column
            job_base_info = data.get('jobBaseInfoVO', {})
            if job_base_info and 'jobId' in job_base_info:
                json_data['job_id'] = str(job_base_info['jobId'])
        elif platform == PlatformType.QCWY and data_type == DataType.JOB:
            # QCWY job data: add the job_id and update_date_time columns
            if 'jobId' in data:
                json_data['job_id'] = str(data['jobId'])
            if 'updateDateTime' in data:
                json_data['update_date_time'] = str(data['updateDateTime'])
        elif platform == PlatformType.ZHILIAN and data_type == DataType.JOB:
            # Zhilian job data: add the number and first_publish_time columns
            if 'number' in data:
                json_data['number'] = str(data['number'])
            if 'firstPublishTime' in data:
                json_data['first_publish_time'] = str(data['firstPublishTime'])
        # Add the company dedup column according to platform and data type
        elif platform == PlatformType.BOSS and data_type == DataType.COMPANY:
            # BOSS company data: add the company_name column
            company_name = data.get('name') or data.get('companyFullInfoVO', {}).get('name')
            if company_name:
                json_data['company_name'] = str(company_name)
        elif platform == PlatformType.QCWY and data_type == DataType.COMPANY:
            # QCWY company data: add the company_name column
            company_name = data.get('companyName') or data.get('company_name')
            if company_name:
                json_data['company_name'] = str(company_name)
        elif platform == PlatformType.ZHILIAN and data_type == DataType.COMPANY:
            # Zhilian company data: add the company_name column
            company_name = data.get('companyName') or data.get('name')
            if company_name:
                json_data['company_name'] = str(company_name)
        # Insert into the matching JSON table
        await self._insert_data_to_clickhouse(json_table_name, json_data)
        logger.info(f"{platform} {data_type} 数据以JSON格式存储成功到表 {json_table_name}")
        return {
            "success": True,
            "message": "JSON数据存储成功",
            "duplicate": False,
            "table_name": json_table_name,
            "storage_type": "json"
        }
    except Exception as e:
        logger.error(f"JSON数据存储失败: {str(e)}")
        raise e
async def query_json_data(self,
                          platform: PlatformType,
                          data_type: DataType,
                          json_fields: Optional[Dict[str, str]] = None,
                          limit: int = 100,
                          offset: int = 0) -> Dict[str, Any]:
    """Query rows from the JSON storage table of a platform / data type.

    Args:
        platform: Platform type (required).
        data_type: Data type (required).
        json_fields: Optional mapping ``{alias: json_path}``. When given, only
            ``created_at`` plus one ``JSONExtractString(json_data, path)``
            column per entry is selected; otherwise every column is selected.
        limit: Maximum number of rows to return.
        offset: Number of rows to skip (ordered by ``created_at`` descending).

    Returns:
        On success: ``{"success": True, "data": [...], "columns": [...],
        "count": <total rows in the table>, "table_name": ...}``.
        On any failure: ``{"success": False, "message": ..., "error": ...}``.
    """
    try:
        # Resolve the JSON table for this platform / data type.
        json_table_name = self._get_json_table_name(data_type, platform)

        # Total row count of the table (not of the returned page).
        count_query = f"SELECT count() FROM job_data.{json_table_name}"
        count_result = await self.clickhouse_client.query(count_query)
        total_count = count_result.result_rows[0][0] if count_result.result_rows else 0

        parameters: Dict[str, Any] = {}
        if json_fields:
            select_fields = ['created_at']
            for index, (alias, json_path) in enumerate(json_fields.items()):
                alias_text = str(alias)
                # The alias becomes an identifier in the SQL text, so it cannot be
                # bound as a parameter; restrict it to a plain ASCII identifier.
                if not (alias_text.isascii() and alias_text.isidentifier()):
                    raise ValueError(f"invalid column alias: {alias!r}")
                # The JSON path is bound as a query parameter instead of being
                # spliced into the SQL string.
                param_name = f"json_path_{index}"
                parameters[param_name] = str(json_path)
                select_fields.append(
                    f"JSONExtractString(json_data, {{{param_name}:String}}) as {alias_text}"
                )
            query = f"SELECT {', '.join(select_fields)} FROM job_data.{json_table_name}"
        else:
            # No field mapping given: return every column.
            query = f"SELECT * FROM job_data.{json_table_name}"
        # int() guarantees only numeric text reaches the SQL string.
        query += f" ORDER BY created_at DESC LIMIT {int(limit)} OFFSET {int(offset)}"

        result = await self.clickhouse_client.query(query, parameters=parameters or None)

        # Convert rows to dicts and flatten the stored JSON document into each item.
        data = []
        for row in result.result_rows:
            item = dict(zip(result.column_names, row))
            raw_json = item.get('json_data')
            if isinstance(raw_json, str):
                try:
                    json_content = json.loads(raw_json)
                except ValueError:
                    # Not valid JSON: keep the raw string untouched.
                    json_content = None
                if isinstance(json_content, dict):
                    item.update(json_content)
            data.append(item)
        logger.info(f"JSON数据查询成功从表 {json_table_name} 返回 {len(result.result_rows)} 条记录")
        return {
            "success": True,
            "data": data,
            "columns": result.column_names,
            "count": total_count,
            "table_name": json_table_name
        }
    except Exception as e:
        logger.error(f"JSON数据查询失败: {str(e)}")
        return {
            "success": False,
            "message": f"查询失败: {str(e)}",
            "error": str(e)
        }
async def _insert_data_to_clickhouse(self, table_name: str, data: Dict[str, Any]) -> None:
"""向ClickHouse表插入数据
Args:
table_name: 表名
data: 要插入的数据字典
"""
try:
columns = list(data.keys())
values = [[data.get(col) for col in columns]]
await self.clickhouse_client.insert(f"job_data.{table_name}", values, column_names=columns)
except Exception as e:
logger.error(f"向表 {table_name} 插入数据失败: {str(e)}")
raise e
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def _check_qcwy_duplicate(self, table_name: str, job_id: str, update_date_time: str) -> Optional[
        Dict[str, Any]]:
    """Look up an existing QCWY job row by ``job_id`` and ``update_date_time``.

    Args:
        table_name: Table inside the ``job_data`` database to search.
        job_id: Job identifier to match.
        update_date_time: Update timestamp string to match.

    Returns:
        ``{"id": ..., "created_at": ...}`` of the first matching row, or
        ``None`` when no row matches or the lookup fails (failures are logged).
    """
    try:
        # The braces are doubled so the f-string emits literal ``{name:Type}``
        # placeholders for ClickHouse parameter binding. With single braces Python
        # tried to format the value itself (invalid format spec / undefined name),
        # the error was swallowed below and the check never found a duplicate.
        query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE job_id = {{job_id:String}}
              AND update_date_time = {{udt:String}}
            LIMIT 1
        """
        result = await self.clickhouse_client.query(
            query, parameters={"job_id": str(job_id), "udt": str(update_date_time)})
        if result.result_rows:
            logger.info(f"发现QCWY重复数据: jobId={job_id}, updateDateTime={update_date_time}")
            return {
                "id": result.result_rows[0][0],
                "created_at": result.result_rows[0][1]
            }
        return None
    except Exception as e:
        # NOTE(review): errors are swallowed here, so the @retry decorator never retries.
        logger.error(f"检查QCWY重复数据失败: {str(e)}")
        return None
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def _check_zhilian_duplicate(self, table_name: str, number: str, first_publish_time: str) -> Optional[
        Dict[str, Any]]:
    """Look up an existing Zhilian job row by ``number`` and ``first_publish_time``.

    Args:
        table_name: Table inside the ``job_data`` database to search.
        number: Job number to match.
        first_publish_time: First-publish timestamp string to match.

    Returns:
        A dict with ``id``, ``created_at``, ``number`` and
        ``first_publish_time`` for the first matching row, or ``None`` when
        no row matches or the lookup fails (failures are logged).
    """
    try:
        # The braces are doubled so the f-string emits literal ``{name:Type}``
        # placeholders for ClickHouse parameter binding. With single braces Python
        # tried to format the value itself (invalid format spec / undefined name),
        # the error was swallowed below and the check never found a duplicate.
        query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE number = {{number:String}}
              AND first_publish_time = {{fpt:String}}
            LIMIT 1
        """
        result = await self.clickhouse_client.query(
            query, parameters={"number": str(number), "fpt": str(first_publish_time)})
        if result.result_rows:
            logger.info(f"发现智联重复数据: number={number}, firstPublishTime={first_publish_time}")
            return {
                "id": result.result_rows[0][0],
                "created_at": result.result_rows[0][1],
                "number": number,
                "first_publish_time": first_publish_time
            }
        return None
    except Exception as e:
        # NOTE(review): errors are swallowed here, so the @retry decorator never retries.
        logger.error(f"检查智联重复数据失败: {str(e)}")
        return None
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def _check_boss_duplicate(self, table_name: str, job_id: Any) -> Optional[Dict[str, Any]]:
    """Look up an existing BOSS job row by ``job_id``.

    Args:
        table_name: Table inside the ``job_data`` database to search.
        job_id: Job identifier; converted to ``str`` before binding.

    Returns:
        ``{"id": ..., "created_at": ...}`` of the first matching row, or
        ``None`` when no row matches or the lookup fails (failures are logged).
    """
    try:
        # The braces are doubled so the f-string emits a literal ``{job_id:String}``
        # placeholder for ClickHouse parameter binding. With single braces Python
        # applied "String" as a format spec, raised, and the error was swallowed
        # below, so the check never found a duplicate.
        query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE job_id = {{job_id:String}}
            LIMIT 1
        """
        result = await self.clickhouse_client.query(query, parameters={"job_id": str(job_id)})
        if result.result_rows:
            logger.info(f"发现BOSS重复数据: jobId={job_id}")
            return {
                "id": result.result_rows[0][0],
                "created_at": result.result_rows[0][1]
            }
        return None
    except Exception as e:
        # NOTE(review): errors are swallowed here, so the @retry decorator never retries.
        logger.error(f"检查BOSS重复数据失败: {str(e)}")
        return None
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def _check_boss_company_duplicate(self, table_name: str, company_name: str) -> Optional[Dict[str, Any]]:
    """Look up an existing BOSS company row by ``company_name``.

    Args:
        table_name: Table inside the ``job_data`` database to search.
        company_name: Company name to match exactly.

    Returns:
        ``{"id": ..., "created_at": ...}`` of the first matching row, or
        ``None`` when no row matches or the lookup fails (failures are logged).
    """
    try:
        # The braces are doubled so the f-string emits a literal
        # ``{company_name:String}`` placeholder for ClickHouse parameter binding.
        # With single braces Python applied "String" as a format spec, raised, and
        # the error was swallowed below, so the check never found a duplicate.
        query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE company_name = {{company_name:String}}
            LIMIT 1
        """
        result = await self.clickhouse_client.query(query, parameters={"company_name": str(company_name)})
        if result.result_rows:
            logger.info(f"发现BOSS公司重复数据: companyName={company_name}")
            return {
                "id": result.result_rows[0][0],
                "created_at": result.result_rows[0][1]
            }
        return None
    except Exception as e:
        # NOTE(review): errors are swallowed here, so the @retry decorator never retries.
        logger.error(f"检查BOSS公司重复数据失败: {str(e)}")
        return None
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def _check_qcwy_company_duplicate(self, table_name: str, company_name: str) -> Optional[Dict[str, Any]]:
    """Look up an existing QCWY company row by ``company_name``.

    Args:
        table_name: Table inside the ``job_data`` database to search.
        company_name: Company name to match exactly.

    Returns:
        ``{"id": ..., "created_at": ...}`` of the first matching row, or
        ``None`` when no row matches or the lookup fails (failures are logged).
    """
    try:
        # The braces are doubled so the f-string emits a literal
        # ``{company_name:String}`` placeholder for ClickHouse parameter binding.
        # With single braces Python applied "String" as a format spec, raised, and
        # the error was swallowed below, so the check never found a duplicate.
        query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE company_name = {{company_name:String}}
            LIMIT 1
        """
        result = await self.clickhouse_client.query(query, parameters={"company_name": str(company_name)})
        if result.result_rows:
            logger.info(f"发现QCWY公司重复数据: companyName={company_name}")
            return {
                "id": result.result_rows[0][0],
                "created_at": result.result_rows[0][1]
            }
        return None
    except Exception as e:
        # NOTE(review): errors are swallowed here, so the @retry decorator never retries.
        logger.error(f"检查QCWY公司重复数据失败: {str(e)}")
        return None
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def _check_qcwy_company_duplicate_by_name(self, table_name: str, company_name: str) -> Optional[
        Dict[str, Any]]:
    """Look up an existing QCWY company row by ``company_name``.

    Runs the same query as ``_check_qcwy_company_duplicate``; kept as a
    separate entry point so existing callers keep working.

    Args:
        table_name: Table inside the ``job_data`` database to search.
        company_name: Company name to match exactly.

    Returns:
        ``{"id": ..., "created_at": ...}`` of the first matching row, or
        ``None`` when no row matches or the lookup fails (failures are logged).
    """
    try:
        # The braces are doubled so the f-string emits a literal
        # ``{company_name:String}`` placeholder for ClickHouse parameter binding.
        # With single braces Python applied "String" as a format spec, raised, and
        # the error was swallowed below, so the check never found a duplicate.
        query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE company_name = {{company_name:String}}
            LIMIT 1
        """
        result = await self.clickhouse_client.query(query, parameters={"company_name": str(company_name)})
        if result.result_rows:
            logger.info(f"发现QCWY公司重复数据: companyName={company_name}")
            return {
                "id": result.result_rows[0][0],
                "created_at": result.result_rows[0][1]
            }
        return None
    except Exception as e:
        # NOTE(review): errors are swallowed here, so the @retry decorator never retries.
        logger.error(f"检查QCWY公司重复数据失败: {str(e)}")
        return None
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def _check_zhilian_company_duplicate(self, table_name: str, company_name: str) -> Optional[Dict[str, Any]]:
    """Look up an existing Zhilian company row by ``company_name``.

    Args:
        table_name: Table inside the ``job_data`` database to search.
        company_name: Company name to match exactly.

    Returns:
        ``{"id": ..., "created_at": ...}`` of the first matching row, or
        ``None`` when no row matches or the lookup fails (failures are logged).
    """
    try:
        # The braces are doubled so the f-string emits a literal
        # ``{company_name:String}`` placeholder for ClickHouse parameter binding.
        # With single braces Python applied "String" as a format spec, raised, and
        # the error was swallowed below, so the check never found a duplicate.
        query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE company_name = {{company_name:String}}
            LIMIT 1
        """
        result = await self.clickhouse_client.query(query, parameters={"company_name": str(company_name)})
        if result.result_rows:
            logger.info(f"发现智联公司重复数据: companyName={company_name}")
            return {
                "id": result.result_rows[0][0],
                "created_at": result.result_rows[0][1]
            }
        return None
    except Exception as e:
        # NOTE(review): errors are swallowed here, so the @retry decorator never retries.
        logger.error(f"检查智联公司重复数据失败: {str(e)}")
        return None
async def send_to_remote_server(self, data: Dict[str, Any]) -> bool:
    """Push one record to the remote collection endpoint (simplified version).

    The payload is sent as-is as the JSON request body.

    Args:
        data: Record to send. ``source_type``, ``title`` and
            ``company_name`` / ``name`` are read only for the log line.

    Returns:
        ``True`` when the server answers HTTP 200, ``False`` on any other
        status or on any exception (both are logged).
    """
    import asyncio  # local import: keeps this block self-contained

    # Log the key fields of the record being reported.
    source_type = data.get('source_type', '未知平台')
    title = data.get('title', '未知职位')
    company_name = data.get('company_name', data.get('name', '未知公司'))
    logger.info(f"📤 上报数据: [{source_type}] {title} - {company_name}")
    try:
        # Build the authentication parameters.
        from_id = 9910056
        timestamp = int(time.time())
        # NOTE(review): hard-coded shared secret; consider loading it from configuration.
        salt = 'jWcIqJK6QlR2syb6HQgpel9iOoOkj01G5MDFNtQLaTxhddHUTEnURsMe2RxCTYC8'
        # Token = md5(salt + timestamp), hex encoded.
        token_string = salt + str(timestamp)
        token = hashlib.md5(token_string.encode()).hexdigest()
        url = f'http://external-data.qixin.com/extend/extend_data_push?from={from_id}&token={token}&time={timestamp}'
        headers = {
            'Content-Type': 'application/json',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        }
        # requests is blocking; run it in a worker thread so the event loop is
        # not stalled for up to the 30 s timeout while the request is in flight.
        response = await asyncio.to_thread(
            requests.post, url, json=data, headers=headers, timeout=30
        )
        if response.status_code == 200:
            return True
        logger.error(f"❌ 数据发送失败: {response.status_code} - {response.text[:100]}")
        return False
    except Exception as e:
        logger.error(f"❌ 发送异常: {str(e)}")
        return False
async def batch_store_data(self,
                           data_list: List[Dict[str, Any]],
                           data_type: DataType,
                           platform: PlatformType,
                           check_duplicate: bool = True) -> Dict[str, Any]:
    """Store a list of records using one batched insert.

    Args:
        data_list: Records to store.
        data_type: Data type (job/company).
        platform: Platform type (boss/qcwy/zhilian).
        check_duplicate: Whether duplicates should be filtered at insert time.

    Returns:
        Summary dict with ``total``, ``success``, ``failed``, ``duplicate``
        and ``errors``. If the batch path raises unexpectedly, the result of
        the one-by-one fallback is returned instead.
    """
    results = {
        "total": len(data_list),
        "success": 0,
        "failed": 0,
        "duplicate": 0,
        "errors": []
    }
    if not data_list:
        return results
    try:
        json_table_name = self._get_json_table_name(data_type, platform)
        valid_data_list = []
        remote_push_data_list = []

        # Step 1: build the insert rows and the remote-push payloads.
        for i, data in enumerate(data_list):
            try:
                current_time = datetime.now()
                json_data = {
                    'id': 0,  # generated automatically
                    'json_data': json.dumps(data, ensure_ascii=False),
                    'created_at': current_time,
                    'updated_at': current_time
                }
                self._add_dedup_fields(json_data, data, data_type, platform)
                valid_data_list.append(json_data)
                remote_data = await self._prepare_remote_push_data(data, data_type, platform)
                if remote_data:
                    remote_push_data_list.append(remote_data)
            except Exception as e:
                results["failed"] += 1
                results["errors"].append({
                    "index": i,
                    "error": f"数据预处理失败: {str(e)}"
                })

        # Step 2: batch insert; duplicates are filtered inside the insert helper.
        if valid_data_list:
            try:
                insert_result = await self._batch_insert_to_clickhouse(json_table_name, valid_data_list,
                                                                       ignore_duplicates=check_duplicate)
                results["success"] = insert_result["inserted"]
                results["duplicate"] = insert_result["ignored"]
            except Exception as e:
                logger.error(f"批量插入失败: {str(e)}")
                # Accumulate rather than overwrite, so records that already failed
                # during preprocessing stay counted.
                results["failed"] += len(valid_data_list)
                results["errors"].append({
                    "error": f"批量插入失败: {str(e)}"
                })

        # Step 3: push to the remote server; a failure here does not change the
        # storage result.
        if remote_push_data_list:
            try:
                await self._batch_send_to_remote_server(remote_push_data_list)
                logger.info(f"批量推送到远程服务器成功: {len(remote_push_data_list)} 条数据")
            except Exception as e:
                logger.warning(f"批量推送到远程服务器失败: {str(e)}")
    except Exception as e:
        logger.error(f"批量存储数据失败: {str(e)}")
        # The batch path failed as a whole: fall back to storing one by one.
        return await self._fallback_individual_store(data_list, data_type, platform, check_duplicate)
    return results
def _add_dedup_fields(self, json_data: Dict[str, Any], data: Dict[str, Any], data_type: DataType,
                      platform: PlatformType):
    """Copy the de-duplication key fields from ``data`` into ``json_data``.

    ``json_data`` is mutated in place; nothing is returned. A field is only
    added when the corresponding source value is present.

    Args:
        json_data: Row dict being prepared for insertion.
        data: Raw record as received from the platform.
        data_type: Data type (job/company).
        platform: Platform type (boss/qcwy/zhilian).
    """
    if data_type == DataType.JOB:
        if platform == PlatformType.BOSS:
            # ``or {}`` also covers a key that is present with a null value.
            job_base_info = data.get('jobBaseInfoVO') or {}
            if 'jobId' in job_base_info:
                json_data['job_id'] = str(job_base_info['jobId'])
        elif platform == PlatformType.QCWY:
            if 'jobId' in data:
                json_data['job_id'] = str(data['jobId'])
            if 'updateDateTime' in data:
                json_data['update_date_time'] = str(data['updateDateTime'])
        elif platform == PlatformType.ZHILIAN:
            if 'number' in data:
                json_data['number'] = str(data['number'])
            if 'firstPublishTime' in data:
                json_data['first_publish_time'] = str(data['firstPublishTime'])
    elif data_type == DataType.COMPANY:
        if platform == PlatformType.BOSS:
            # A null ``companyFullInfoVO`` used to raise AttributeError here.
            company_name = data.get('name') or (data.get('companyFullInfoVO') or {}).get('name')
        elif platform == PlatformType.QCWY:
            company_name = data.get('companyName') or data.get('company_name')
        elif platform == PlatformType.ZHILIAN:
            company_name = data.get('companyName') or data.get('name')
        else:
            company_name = None
        if company_name:
            json_data['company_name'] = str(company_name)
async def _prepare_remote_push_data(self, data: Dict[str, Any], data_type: DataType, platform: PlatformType) -> \
        Optional[Dict[str, Any]]:
    """Map a raw job record to the payload format of the remote push endpoint.

    Args:
        data: Raw record as received from the platform.
        data_type: Data type; only job records are pushed.
        platform: Platform type selecting the field mapping.

    Returns:
        The payload dict, or ``None`` for non-job data, for a platform without
        a mapping, or when building the payload fails (the error is logged).
    """
    if data_type != DataType.JOB:
        return None
    try:
        if platform == PlatformType.QCWY:
            # Welfare may arrive as a list of dicts or as a bracketed string.
            welfare_list = data.get("jobWelfareCodeDataList")
            if isinstance(welfare_list, list):
                welfare_str = ",".join(
                    str(item.get("chineseTitle") or item.get("typeTitle") or item.get("englishTitle") or item.get("code"))
                    for item in welfare_list if isinstance(item, dict)
                )
            elif isinstance(welfare_list, str):
                welfare_str = welfare_list.replace("[", "").replace("]", "")
            else:
                welfare_str = ""

            # Location: prefer "location", then fall back to the workLocation object.
            raw_location = data.get("location") or ""
            if not raw_location:
                work_loc = data.get("workLocation") or {}
                raw_location = work_loc.get("workAddress") or work_loc.get("address") or ""
            location_val = raw_location or "位置信息未找到"

            # Area: prefer "jobAreaString", then fall back to city + landmark.
            raw_area = data.get("jobAreaString") or ""
            if not raw_area:
                level_detail = data.get("jobAreaLevelDetail") or {}
                city_str = level_detail.get("cityString") or ""
                landmark_str = level_detail.get("landMarkString") or ""
                raw_area = f"{city_str}{landmark_str}".strip()
            area_val = raw_area or "位置信息未找到"

            remote_resp = {
                'source_type': '前程无忧',
                'name': data.get("companyName"),
                'title': data.get("jobName"),
                'title_addr': data.get("jobName"),
                'description': data.get("jobDescribe"),
                'age': "",
                'sex': "",
                'number': "",
                'education': data.get("degreeString"),
                'skill': await self.safe_join(data.get("jobTagsForOrder")),
                'welfare': welfare_str,
                'years': data.get("workYearString"),
                # NOTE(review): emitted as max-min while the BOSS branch emits
                # low-high; confirm the order the remote endpoint expects.
                'salary': f'{data.get("jobSalaryMax", "")}-{data.get("jobSalaryMin", "")}',
                'location': location_val,
                'position': area_val,
                'date': data.get("confirmDateString"),
                'start_date': data.get("confirmDateString"),
                'end_date': "",
                'job_type': data.get("termStr"),
                'size': data.get("companySizeString"),
                'employer_type': data.get("companyTypeString"),
                'industry': f'{data.get("major1Str", "")}-{data.get("major2Str", "")}',
                'job_1st_class': "",
                'job_2nd_class': "",
                'job_3rd_class': "",
                'job_4th_class': "",
                'url': data.get("jobHref"),
                'company_id': data.get("coId"),
                'company_name': data.get("fullCompanyName"),
                'company_url': data.get("companyHref"),
                'company_desc': data.get("company_desc", ""),
                'base_data': data
            }
            return remote_resp
        elif platform == PlatformType.BOSS:
            bossBaseInfoVO = data.get("bossBaseInfoVO", {})
            jobBaseInfoVO = data.get("jobBaseInfoVO", {})
            brandComInfoVO = data.get("brandComInfoVO", {})
            boss_resp = {
                'source_type': 'Boss直聘',
                'name': await self.safe_get(brandComInfoVO, "brandName"),
                'common_name': await self.safe_get(bossBaseInfoVO, "brandName"),
                'title': await self.safe_get(jobBaseInfoVO, "positionName"),
                'title_addr': await self.safe_get(jobBaseInfoVO, "positionName"),
                'description': await self.safe_get(jobBaseInfoVO, "jobDesc"),
                'education': await self.safe_get(jobBaseInfoVO, "degreeName"),
                'skill': await self.safe_join(jobBaseInfoVO.get("requiredSkills") if jobBaseInfoVO else None),
                'welfare': await self.safe_join(jobBaseInfoVO.get("salaryWelfareInfo") if jobBaseInfoVO else None),
                'years': await self.safe_get(jobBaseInfoVO, "experienceName"),
                'salary': f'{await self.safe_get(jobBaseInfoVO, "lowSalary")}-{await self.safe_get(jobBaseInfoVO, "highSalary")}',
                'location': await self.safe_get(jobBaseInfoVO, "locationName", "位置信息未找到"),
                'position': await self.safe_get(jobBaseInfoVO, "locationDesc", "位置信息未找到"),
                'job_type': "全职",
                'size': await self.safe_get(brandComInfoVO, "scaleName"),
                'employer_type': "全职",
                'industry': await self.safe_get(brandComInfoVO, "industryName"),
                'job_1st_class': "",
                'job_2nd_class': "",
                'job_3rd_class': "",
                'job_4th_class': "",
                'date': "",
                'start_date': "",
                'end_date': "",
                'age': "",
                'sex': "",
                'number': "",
                'url': f"https://www.zhipin.com/job_detail/{await self.safe_get(jobBaseInfoVO, 'encryptJobId')}.html",
                'company_id': await self.safe_get(brandComInfoVO, "encryptBrandId"),
                'company_name': await self.safe_get(brandComInfoVO, "brandName"),
                'company_url': f"https://www.zhipin.com/gongsi/{await self.safe_get(brandComInfoVO, 'encryptBrandId')}.html",
                'company_desc': await self.safe_get(brandComInfoVO, "introduce"),
                'base_data': data
            }
            return boss_resp
        elif platform == PlatformType.ZHILIAN:
            # A null "skillLabel" or a tag without "value" used to raise and drop
            # the whole record; malformed entries are now skipped instead.
            skill_values = [
                tag['value']
                for tag in (data.get('skillLabel') or [])
                if isinstance(tag, dict) and 'value' in tag
            ]
            zhilian_resp = {
                'source_type': '智联招聘',
                'name': await self.safe_get(data, 'companyName'),
                'common_name': await self.safe_get(data, 'companyName'),
                'title': await self.safe_get(data, 'name'),
                'title_addr': await self.safe_get(data, 'name'),
                'description': await self.safe_get(data, 'jobSummary'),
                'education': await self.safe_get(data, 'education'),
                'skill': await self.safe_join(skill_values),
                'welfare': '',  # no welfare field is mapped for Zhilian records
                'years': await self.safe_get(data, 'workingExp'),
                'salary': await self.safe_get(data, 'salary60'),
                'location': f"{await self.safe_get(data, 'workCity')}{await self.safe_get(data, 'cityDistrict')}",
                'position': f"{await self.safe_get(data, 'workCity')}{await self.safe_get(data, 'cityDistrict')}",
                'job_type': await self.safe_get(data, 'workType'),
                'size': await self.safe_get(data, 'companySize'),
                'employer_type': await self.safe_get(data, 'propertyName'),
                'industry': await self.safe_get(data, 'industryName'),
                'job_1st_class': '',
                'job_2nd_class': '',
                'job_3rd_class': '',
                'job_4th_class': '',
                'date': await self.safe_get(data, 'firstPublishTime'),
                'start_date': '',
                'end_date': '',
                'age': '',
                'sex': '',
                'number': str(await self.safe_get(data, 'recruitNumber')),
                'url': await self.safe_get(data, 'positionURL'),
                'company_id': str(await self.safe_get(data, 'companyId')),
                'company_name': await self.safe_get(data, 'companyName'),
                'company_url': await self.safe_get(data, 'companyUrl'),
                'company_desc': await self.safe_get(data, 'companyDesc'),
                'base_data': data
            }
            return zhilian_resp
    except Exception as e:
        logger.error(f"准备远程推送数据失败: {str(e)}")
        return None
    # No mapping exists for this platform.
    return None
async def _batch_insert_to_clickhouse(self, table_name: str, data_list: List[Dict[str, Any]],
ignore_duplicates: bool = True) -> Dict[str, int]:
"""批量插入数据到ClickHouse支持忽略重复数据
Args:
table_name: 表名
data_list: 数据列表
ignore_duplicates: 是否忽略重复数据
Returns:
插入结果统计 {"inserted": 插入数量, "ignored": 忽略数量}
"""
result = {"inserted": 0, "ignored": 0}
if not data_list:
return result
try:
columns = list(data_list[0].keys())
filtered_list = data_list
if ignore_duplicates:
dedup_cols = self._get_dedup_columns_for_table(table_name)
if dedup_cols:
if len(dedup_cols) == 1:
key_col = dedup_cols[0]
candidate_keys = list({str(d.get(key_col, "")) for d in data_list if d.get(key_col)})
if candidate_keys:
query = f"""
SELECT {key_col}
FROM job_data.{table_name}
WHERE {key_col} IN {{keys:Array(String)}}
"""
existing = await self.clickhouse_client.query(query, parameters={"keys": candidate_keys})
existing_set = {str(r[0]) for r in existing.result_rows}
filtered_list = [d for d in data_list if str(d.get(key_col, "")) not in existing_set]
elif len(dedup_cols) == 2:
c1, c2 = dedup_cols
candidate_c1 = list({str(d.get(c1, "")) for d in data_list if d.get(c1)})
if candidate_c1:
query = f"""
SELECT {c1}, {c2}
FROM job_data.{table_name}
WHERE {c1} IN {{keys:Array(String)}}
"""
existing = await self.clickhouse_client.query(query, parameters={"keys": candidate_c1})
existing_map = {}
for r in existing.result_rows:
k = str(r[0])
v = str(r[1])
existing_map.setdefault(k, set()).add(v)
filtered_list = [
d for d in data_list
if str(d.get(c1, "")) not in existing_map or str(d.get(c2, "")) not in existing_map.get(str(d.get(c1, "")), set())
]
batch_values = [[item.get(col) for col in columns] for item in filtered_list]
if batch_values:
await self.clickhouse_client.insert(f"job_data.{table_name}", batch_values, column_names=columns)
result["inserted"] = len(batch_values)
result["ignored"] = len(data_list) - result["inserted"]
except Exception as e:
logger.error(f"批量插入到表 {table_name} 失败: {str(e)}")
raise e
return result
def _get_dedup_columns_for_table(self, table_name: str) -> List[str]:
"""获取表的去重列"""
if table_name == "boss_job":
return ["job_id"]
if table_name == "qcwy_job":
return ["job_id", "update_date_time"]
if table_name == "zhilian_job":
return ["number", "first_publish_time"]
if table_name in ("boss_company", "qcwy_company", "zhilian_company"):
return ["company_name"]
return []
async def _batch_send_to_remote_server(self, data_list: List[Dict[str, Any]]) -> None:
"""批量发送数据到远程服务器"""
for data in data_list:
try:
await self.send_to_remote_server(data)
except Exception as e:
logger.error(f"批量推送单条数据失败: {str(e)}")
# 继续处理下一条数据
async def _fallback_individual_store(self, data_list: List[Dict[str, Any]], data_type: DataType,
                                     platform: PlatformType, check_duplicate: bool) -> Dict[str, Any]:
    """Store records one at a time through ``store_data`` and tally the outcomes.

    Each record is counted as ``success``, ``duplicate`` or ``failed``;
    failed records also get an entry in ``errors`` carrying their index and
    the failure message (or the exception text).
    """
    summary = {
        "total": len(data_list),
        "success": 0,
        "failed": 0,
        "duplicate": 0,
        "errors": []
    }
    for index, record in enumerate(data_list):
        error_text = None
        try:
            outcome = await self.store_data(record, data_type, platform, check_duplicate)
            if outcome["success"]:
                bucket = "success"
            elif outcome.get("duplicate"):
                bucket = "duplicate"
            else:
                bucket = "failed"
                error_text = outcome.get("message", "未知错误")
        except Exception as exc:
            bucket = "failed"
            error_text = str(exc)
        summary[bucket] += 1
        if bucket == "failed":
            summary["errors"].append({
                "index": index,
                "error": error_text
            })
    return summary
# Factory function for creating the service instance.
def create_data_router_service(clickhouse_client: AsyncClient) -> DataRouterService:
    """Build a ``DataRouterService`` bound to the given ClickHouse client."""
    service = DataRouterService(clickhouse_client)
    return service

View File

@ -1,46 +0,0 @@
FROM python:3.11-slim
# Point apt-get at the Aliyun mirror; try the deb822 sources file first, then the
# legacy sources.list, and never fail the build if neither file exists.
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources || \
sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list || true
# Install system dependencies
# Node.js is required for PyExecJS
RUN apt-get update && apt-get install -y \
nodejs \
npm \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
# Point npm at the npmmirror (Taobao) registry
RUN npm config set registry https://registry.npmmirror.com
WORKDIR /app
# Point pip at a domestic (Tsinghua) mirror by writing a pip.conf file
RUN mkdir -p /root/.pip && \
echo '[global]' > /root/.pip/pip.conf && \
echo 'index-url = https://pypi.tuna.tsinghua.edu.cn/simple' >> /root/.pip/pip.conf && \
echo 'trusted-host = pypi.tuna.tsinghua.edu.cn' >> /root/.pip/pip.conf && \
echo 'timeout = 120' >> /root/.pip/pip.conf
# Copy requirements first to leverage cache
COPY requirements.txt .
# Install dependencies through the mirror configured above
RUN pip install --no-cache-dir -r requirements.txt
# Install Playwright browsers and system dependencies
# We only need chromium for this project
# Download Playwright browsers from the npmmirror mirror
ENV PLAYWRIGHT_DOWNLOAD_HOST=https://npmmirror.com/mirrors/playwright
RUN playwright install chromium
RUN playwright install-deps chromium
COPY . .
# Expose the port
EXPOSE 8000
# Run the application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

@ -1,12 +0,0 @@
# Example: POST a keyword to the local qcwy company-search endpoint.
curl --location 'http://127.0.0.1:9999/api/v1/company/qcwy/search' \
--header 'Content-Type: application/json' \
--data '{
"keyword": "中信期货有限公司"
}'
# Example: POST a keyword plus a city to the local zhilian company-search endpoint.
curl --location 'http://127.0.0.1:9999/api/v1/company/zhilian/search' \
--header 'Content-Type: application/json' \
--data '{
"keyword": "中信期货有限公司",
"city":"北京"
}'

View File

@ -1,65 +0,0 @@
// Stub for the browser global so the script can run outside a browser.
window = {};
var arg3 = null;
var arg4 = null;
var arg5 = null;
var arg6 = null;
var arg7 = null;
var arg8 = null;
var arg9 = null;
var arg10 = null;
// l(arg1): reorder the characters of arg1 with unsbox(), XOR the result with a
// fixed hex key via hexXor(), and return the resulting hex string.
var l = function (arg1) {
// Spins forever if window._phantom or window.__phantomas is truthy; with the
// empty window stub above the loop body is never entered.
while (window["_phantom"] || window["__phantomas"]) {
}
// Fixed hex key that the reordered input is XOR-ed with.
var _0x5e8b26 = "3000176000856006061501533003690027800375";
// hexXor(other): walk both hex strings two characters (one byte) at a time, up to
// the length of the shorter one, XOR each byte pair and append the result as two
// hex digits. NOTE: this (re)defines a method on String.prototype on every call.
String["prototype"]["hexXor"] = function (_0x4e08d8) {
var _0x5a5d3b = "";
for (var _0xe89588 = 0; _0xe89588 < this["length"] && _0xe89588 < _0x4e08d8["length"]; _0xe89588 += 2) {
var _0x401af1 = parseInt(this["slice"](_0xe89588, _0xe89588 + 2), 16);
var _0x105f59 = parseInt(_0x4e08d8["slice"](_0xe89588, _0xe89588 + 2), 16);
var _0x189e2c = (_0x401af1 ^ _0x105f59)["toString"](16);
// Left-pad single-digit results so every byte yields two hex characters.
if (_0x189e2c["length"] == 1) {
_0x189e2c = "0" + _0x189e2c;
}
_0x5a5d3b += _0x189e2c;
}
return _0x5a5d3b;
};
// unsbox(): permute the characters of the string. Output position j receives the
// input character whose 1-based index equals table[j].
String["prototype"]["unsbox"] = function () {
var _0x4b082b = [15, 35, 29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11, 39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36];
var _0x4da0dc = [];
var _0x12605e = "";
for (var _0x20a7bf = 0; _0x20a7bf < this["length"]; _0x20a7bf++) {
var _0x385ee3 = this[_0x20a7bf];
for (var _0x217721 = 0; _0x217721 < _0x4b082b["length"]; _0x217721++) {
if (_0x4b082b[_0x217721] == _0x20a7bf + 1) {
_0x4da0dc[_0x217721] = _0x385ee3;
}
}
}
_0x12605e = _0x4da0dc["join"]("");
return _0x12605e;
};
var _0x23a392 = arg1["unsbox"]();
// NOTE(review): arg2 is assigned without a declaration, so it becomes an implicit
// global (and would throw in strict mode).
arg2 = _0x23a392["hexXor"](_0x5e8b26);
console.log('arg2==>', arg2)
// setTimeout("reload(arg2)", 2);
return arg2
};
// var arg1 = "FAA6CB46CF724D58FF82E5310687947623413114";
// l(arg1)

File diff suppressed because it is too large Load Diff

View File

@ -1,91 +0,0 @@
import crypto from 'crypto';
// Compute the hex-encoded HMAC-SHA256 of `message` under `key` with Node's crypto module.
const hmacSHA256 = (message, key) => {
    const hmac = crypto.createHmac('sha256', key);
    hmac.update(message);
    return hmac.digest('hex');
};
// Sign a request config: HMAC-SHA256 over `url + (data || "")` with the fixed key.
// The url and data are printed first for debugging.
function A(t) {
    const signingKey = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b";
    console.log(t.url);
    console.log(t.data);
    const payload = t.url + (t.data || "");
    return hmacSHA256(payload, signingKey);
}
// Merge the properties of every extra argument into `e` and return `e`.
// Odd-positioned arguments are copied key by key through `i["a"]`; even-positioned
// ones are copied via property descriptors.
// NOTE(review): `r` and `i` are not defined in the visible part of this file, and
// nothing visible calls this function — it looks like a helper pasted from a
// bundled script; confirm before relying on it.
function a(e) {
for (var t = 1; t < arguments.length; t++) {
// Treat a null/undefined argument as an empty object.
var n = null != arguments[t] ? arguments[t] : {};
t % 2 ? r(Object(n), !0).forEach((function (t) {
Object(i["a"])(e, t, n[t])
}
)) : Object.getOwnPropertyDescriptors ? Object.defineProperties(e, Object.getOwnPropertyDescriptors(n)) : r(Object(n)).forEach((function (t) {
Object.defineProperty(e, t, Object.getOwnPropertyDescriptor(n, t))
}
))
}
return e
}
// Sample request config used to exercise A(): only `url` and `data` are read by
// A(); `data` is absent here, so the signature covers the url alone.
var t = {
"transitional": {"silentJSONParsing": true, "forcedJSONParsing": true, "clarifyTimeoutError": false},
"transformRequest": [null],
"transformResponse": [null],
"timeout": 30000,
"xsrfCookieName": "XSRF-TOKEN",
"xsrfHeaderName": "X-XSRF-TOKEN",
"maxContentLength": -1,
"maxBodyLength": -1,
"headers": {
"common": {"Accept": "application/json, text/plain, */*"},
"delete": {},
"get": {"Content-Type": "application/x-www-form-urlencoded"},
"head": {},
"post": {"Content-Type": "application/json"},
"put": {"Content-Type": "application/x-www-form-urlencoded"},
"patch": {"Content-Type": "application/x-www-form-urlencoded"}
},
"baseURL": "https://we.51job.com",
"withCredentials": true,
"url": "/api/job/search-pc?api_key=51job&timestamp=1769136341&keyword=%E5%8D%8E%E4%B8%BA%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&searchType=2&function=&industry=&jobArea=010000&jobArea2=&landmark=&metro=&salary=&workYear=&degree=&companyType=&companySize=&jobType=&issueDate=&sortType=0&pageNum=1&requestId=&keywordType=&pageSize=20&source=1&accountId=&pageCode=sou%7Csou%7Csoulb&scene=7",
"method": "get",
"property": {"keywordType": ""}
};
// Sample page/tracking properties; not referenced by any code visible in this file.
var b = {
"partner": "cn_bing_com",
"webId": 2,
"fromdomain": "51job_web",
"frompageUrl": "https://we.51job.com/",
"pageUrl": "https://we.51job.com/pc/search?jobArea=010000&keyword=%E5%8D%8E%E4%B8%BA%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&searchType=2&keywordType=",
"identityType": "",
"userType": "",
"isLogin": "否",
"accountid": ""
}
// Print the signature computed for the sample request.
console.log(A(t));
// function wordsToHex(words) {
// // CryptoJS 使用 32 位有符号整数存储,需要处理
// let hex = '';
// for (let i = 0; i < words.length; i++) {
// // 将负数转换为无符号整数
// const word = words[i] >>> 0;
// // 转换为十六进制并补零
// hex += word.toString(16).padStart(8, '0');
// }
// return hex;
// }
//
// // 你的数据
// const words = [-762966511, 1702028048, 94455509, -201850815,
// 300412866, 1405396681, 85275542, 246713406];
//
// console.log(wordsToHex(words))
/*
* sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2219be8d71f8213f1-0fd9b910813aa58-4c657b58-3686400-19be8d71f831716%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTliZThkNzFmODIxM2YxLTBmZDliOTEwODEzYWE1OC00YzY1N2I1OC0zNjg2NDAwLTE5YmU4ZDcxZjgzMTcxNiJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%2219be8d71f8213f1-0fd9b910813aa58-4c657b58-3686400-19be8d71f831716%22%7D; ssxmod_itna=1-Gqmx0DuD2Dc0D=Ni73itD2Dp=DmpsKeQDzxCH9P0CCDLxn4xGdY2=Cw3DyD7Tp4RG5DaQi2Yea0xGXKwDA5DnCx7YDt=RcwxK06dvxK=W0mitswe6uDuYLP2GGRRgW_GlDMEHLA6C5N7qxDHwd4KxGLDY=DCqxq57eD4f3Dt4DIDAYDDxDWDYEPxGUQDG=D7rTi5pWtxi3DboaDmd2WC=FD03q=EWFoDDtAbeG2bETqDDNqF9G3_lh3_PD_bW9QKtWemFxPneDMbxGX7YCqnlH2oyDWpFkUsao3xB=gxBQbyPnhwETadZanDY4lGrWYY2DIjGxWxiGG1i05Q03nwsWmwlG1Gv_GDxhw4SrUDDAt_hWDHBRqW0tK2lj5/bc_9yYtAbYW=LgrRxxWqqRAOIotBhNi47fD5=4qf0esGthu5oiPeD; ssxmod_itna2=1-Gqmx0DuD2Dc0D=Ni73itD2Dp=DmpsKeQDzxCH9P0CCDLxn4xGdY2=Cw3DyD7Tp4RG5DaQi2YeaKxDfrQfQGh4qBFjq03_jSefWDlO03BqKSSfAaeFuhD2y0F5nKj4LMzWF2qLViLAjiLzGteYAj1KAULY4hzS3=uiLiHAktq7AQK04=RCrN4_lNnDaNPYDr4nhTEjfu/3d5Fcwil7pUxfDu7yjj5TT0UnkUbM4F0FALQk19oO64i1g2QsibdzqxtPn8oOB3wpj5FVm6R_LF2EKxZIWFfaGt9oNT4U_0IjQx40hUsUKLNOBzuR1Mh=_gTlLdLS53B3OE4dGDB8GdjhOf4MYhuE37oTUMtTCwOOD7WhhjwgohMumFghOCNeDxRqr92NTeIRW=oOeThvw7DBG5/DoFShd7v5ZxwYEKiDD
* */

View File

@ -1,327 +0,0 @@
import hashlib
import hmac
import json
import execjs
import re
import time
import uuid
import requests
from urllib.parse import unquote, quote
from typing import Optional, Dict
import os
class SignGenerator:
    """Request-signing helpers for the 51job web API.

    Signatures are hex-encoded HMAC-SHA256 digests computed with a fixed
    secret key taken from the site's JavaScript.
    """

    def __init__(self):
        # Signing secret (extracted from the site's JS code).
        self.secret_key = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b"
        self.secret_key_bytes = self.secret_key.encode('utf-8')

    def hmac_sha256(self, message, key):
        """Return the hex HMAC-SHA256 of ``message`` under ``key``.

        Both arguments may be ``str`` (encoded as UTF-8) or ``bytes``.
        """
        key_bytes = key.encode('utf-8') if isinstance(key, str) else key
        message_bytes = message.encode('utf-8') if isinstance(message, str) else message
        return hmac.new(key_bytes, message_bytes, hashlib.sha256).hexdigest()

    def generate_signature(self, t):
        """Sign a request config dict (counterpart of the JS function ``A``).

        JS logic: ``signature = hmacSHA256(url + (data || ""), secret_key)``.

        Args:
            t: Mapping with a ``url`` entry and an optional ``data`` entry.
                A missing or falsy ``data`` (``None``, ``""``, ``{}``) counts
                as the empty string; a non-empty dict is serialized as
                compact JSON.

        Returns:
            The hex signature string.
        """
        # ``or None`` mirrors the JS ``data || ""``: previously a present-but-None
        # (or empty dict) ``data`` raised TypeError on string concatenation.
        return self.generate_signature_from_components(t.get("url", ""), t.get("data") or None)

    def generate_signature_from_components(self, url, data=None):
        """Sign ``url`` plus an optional ``data`` payload.

        Args:
            url: Request path including the query string.
            data: ``None`` (treated as ""), a string, or a dict that is
                serialized as compact JSON without ASCII escaping.

        Returns:
            The hex signature string.
        """
        if data is None:
            data = ""
        elif isinstance(data, dict):
            data = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
        return self.hmac_sha256(url + data, self.secret_key)

    def generate_acw_sc__v2(self, arg1):
        """Compute the ``acw_sc__v2`` cookie value for ``arg1``.

        Loads ``04.js`` from the directory of this module and calls its ``l``
        function through execjs.

        Returns:
            The computed value, or ``None`` when the script returns a falsy
            result.
        """
        current_dir = os.path.dirname(os.path.abspath(__file__))
        js_file_path = os.path.join(current_dir, '04.js')
        with open(js_file_path, 'r', encoding='utf-8') as f:
            js = f.read()
        acw_sc__v2 = execjs.compile(js).call('l', arg1)
        return acw_sc__v2 or None

    def generate_company_detail(self, cid: str) -> dict:
        """Build the signature and timestamp for a company-detail request.

        Args:
            cid: Encrypted company id placed in ``encryCompanyId``.

        Returns:
            ``{"signature": <hex digest>, "timestamp": <int epoch seconds>}``.
        """
        timestamp = int(time.time())
        # String to sign: request path and query exactly as sent to the server.
        message = f"/open/noauth/company-info/pc-info?api_key=51job&timestamp={timestamp}&encryCompanyId={cid}"
        # Reuse the shared secret and helper instead of a second copy of the key.
        signature = self.hmac_sha256(message, self.secret_key)
        return {"signature": signature, "timestamp": timestamp}
def search_company(keyword: str, job_area: str = "000000") -> Optional[Dict]:
    """Search 51job for a company whose full name equals ``keyword``.

    Args:
        keyword: company name to search for.
        job_area: job-area code; the default ``"000000"`` means nationwide.

    Returns:
        A dict with ``fullCompanyName``, ``companyName`` and ``companyHref``
        for the first result whose full company name matches ``keyword``
        exactly (after stripping whitespace), or ``None`` when nothing
        matches or the request fails.
    """
    signer = SignGenerator()
    search_url = 'https://we.51job.com/api/job/search-pc'
    wanted = keyword.strip()

    # Query parameters; dict order is also the order used for signing.
    params = {
        'api_key': '51job',
        'timestamp': str(int(time.time())),
        'keyword': keyword,
        'searchType': '2',  # 2 = search companies
        'function': '',
        'industry': '',
        'jobArea': job_area,
        'jobArea2': '',
        'landmark': '',
        'metro': '',
        'salary': '',
        'workYear': '',
        'degree': '',
        'companyType': '',
        'companySize': '',
        'jobType': '',
        'issueDate': '',
        'sortType': '0',
        'pageNum': '1',
        'requestId': '',
        'pageSize': '20',
        'source': '1',
        'accountId': '',
        'pageCode': 'sou|sou|soulb',
        'scene': '7'
    }

    # The signature covers path + query string. Only ``keyword`` is
    # URL-encoded (matching browser behaviour); other values are signed raw.
    query_parts = []
    for k, v in params.items():
        if not v:
            query_parts.append(f'{k}=')
        elif k == 'keyword':
            query_parts.append(f'{k}={quote(str(v))}')
        else:
            query_parts.append(f'{k}={str(v)}')
    full_url = '/api/job/search-pc?' + '&'.join(query_parts)
    sign = signer.generate_signature_from_components(full_url)

    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'From-Domain': '51job_web',
        'Pragma': 'no-cache',
        'Referer': f'https://we.51job.com/pc/search?keyword={quote(keyword)}&searchType=2&sortType=0&metro=',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        'account-id': '',
        'partner': '',
        'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2Fpc%2Fsearch%3Fkeyword%3D' + quote(
            keyword) + '%26searchType%3D2%26sortType%3D0%26metro%3D%22%2C%22identityType%22%3A%22%22%2C%22userType%22%3A%22%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountid%22%3A%22%22%7D',
        'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sign': sign,
        'user-token': '',
        'uuid': str(uuid.uuid4()).replace('-', ''),
    }

    try:
        # The session is closed on exit so pooled connections are released.
        with requests.Session() as session:

            def fetch(cookies=None):
                # NOTE(review): verify=False disables TLS certificate checks;
                # kept from the original, confirm it is really required.
                return session.get(
                    search_url,
                    params=params,
                    headers=headers,
                    cookies=cookies,
                    verify=False,
                    timeout=30
                )

            # The first response may be a challenge page carrying ``arg1``,
            # from which the acw_sc__v2 cookie is derived.
            response = fetch()
            if 'arg1' in response.text:
                arg1_match = re.findall(r"arg1='(.*?)';", response.text, re.S)
                if arg1_match:
                    acw_sc__v2 = signer.generate_acw_sc__v2(arg1_match[0])
                    if acw_sc__v2:
                        cookies = {
                            'guid': str(uuid.uuid4()).replace("-", ""),
                            'acw_sc__v2': acw_sc__v2
                        }
                        # Second request picks up server-set cookies; the
                        # third is sent with the complete cookie set.
                        response2 = fetch(cookies)
                        cookies.update(response2.cookies.get_dict())
                        response = fetch(cookies)

            if response.status_code != 200:
                return None

            try:
                data = response.json()
            except ValueError:
                # requests' JSON decode error is a ValueError whichever JSON
                # backend it was built against.
                print(f"[错误] 响应不是有效的JSON: {response.text[:200]}")
                return None

            if data.get('status') == '1' and 'resultbody' in data:
                resultbody = data['resultbody']
                if 'job' in resultbody and 'items' in resultbody['job']:
                    for item in resultbody['job']['items']:
                        # ``or ''`` guards against explicit nulls, which used
                        # to raise on .strip() and abort the whole scan.
                        full_company_name = (item.get('fullCompanyName') or '').strip()
                        if full_company_name == wanted:
                            return {
                                'fullCompanyName': full_company_name,
                                'companyName': (item.get('companyName') or '').strip(),
                                'companyHref': (item.get('companyHref') or '').strip()
                            }
            return None
    except Exception as e:
        # Best-effort lookup: report the failure and signal "not found".
        print(f"[错误] 请求失败: {e}")
        import traceback
        print(traceback.format_exc())
        return None
def parse_json_company_desc(uri: str) -> dict:
    """Fetch a company's description and address from the 51job JSON API.

    Args:
        uri: company page URL, e.g.
            ``https://jobs.51job.com/all/coUT9QPQdhBzEGY1A1VjQ.html``; the
            last path segment (minus ``.html`` and a leading ``co``) is used
            as the encrypted company id.

    Returns:
        On success a dict with ``company_desc``, ``company_location`` and
        ``encryCompanyId``. On failure a dict whose ``company_desc`` and
        ``company_location`` hold a failure message.
    """
    sy = uri.split("/")[-1].replace(".html", "")
    # Strip only the leading "co" marker. str.replace("co", "") would also
    # delete any "co" occurring inside the id itself.
    cid = sy[2:] if sy.startswith("co") else sy

    signer = SignGenerator()
    generate_company_detail_info = signer.generate_company_detail(cid)
    headers = {
        'Host': 'cupid.51job.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'sign': generate_company_detail_info["signature"],
        'uuid': '1e6151f7bc3ce8d7e526c88d7d6592cd',
        'From-Domain': '51job_web',
        'account-id': '',
        'user-token': '',
        'partner': '',
        'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fjobs.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fjobs.51job.com%2Fall%2Fco4194496.html%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountId%22%3A%22%22%2C%22shortPageCode%22%3A%22gsxq%7Czwlb%7Cgsxqlb%22%2C%22pageCode%22%3A%22gsxq%7Czwlb%7Cgsxqlb%22%7D',
        'Origin': 'https://jobs.51job.com',
        'Connection': 'keep-alive',
        'Referer': 'https://jobs.51job.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'TE': 'trailers',
    }
    try:
        # The timestamp in the URL must be the one that was signed.
        desc_url = f"https://cupid.51job.com/open/noauth/company-info/pc-info?api_key=51job&timestamp={generate_company_detail_info['timestamp']}&encryCompanyId={cid}"
        # timeout added so a stalled connection cannot hang the caller.
        # NOTE(review): verify=False disables TLS certificate checks.
        res = requests.get(url=desc_url, headers=headers, verify=False, timeout=30)
        # A Response is falsy for 4xx/5xx status codes.
        if not res:
            return {"company_desc": "请求失败", "company_location": "请求失败"}
        coinfo = res.json()["resultbody"]["coinfo"]
        return {
            "company_desc": coinfo["coinfo"],
            "company_location": coinfo["caddr"],
            "encryCompanyId": coinfo["encryCompanyId"],
        }
    except Exception as e:
        # Network errors, invalid JSON and missing keys all end up here.
        print(f"解析HTML失败: {e}")
        return {"company_desc": "解析失败", "company_location": "解析失败"}
# 使用示例
if __name__ == "__main__":
    # Manual smoke test: look one company up by name, then fetch one
    # company's details by page URL.
    keyword = "华为技术有限公司"
    result = search_company(keyword)
    if not result:
        print(f"未找到匹配的公司: {keyword}")
    else:
        print("找到匹配的公司:")
        for label, field_name in (
            ("全称", "fullCompanyName"),
            ("简称", "companyName"),
            ("链接", "companyHref"),
        ):
            print(f" {label}: {result[field_name]}")
    print(parse_json_company_desc("https://jobs.51job.com/all/coA2RXNgBnATgPaQJn.html"))

View File

@ -1,92 +0,0 @@
# -*- coding: UTF-8 -*-
import time
import uuid
import requests
import re
import json
import execjs
# Request headers sent with every search request below. The 'sign' and 'uuid'
# values are hard-coded here rather than computed at runtime.
headers = {
'Accept': 'application/json, text/plain, */*',
'Accept-Language': 'zh',
'Cache-Control': 'no-cache',
'Connection': 'keep-alive',
'From-Domain': '51job_web',
'Pragma': 'no-cache',
'Referer': 'https://we.51job.com/pc/search?keyword=java&searchType=2&sortType=0&metro=',
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
'account-id': '',
'partner': '',
'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2Fpc%2Fsearch%3Fkeyword%3Djava%26searchType%3D2%26sortType%3D0%26metro%3D%22%2C%22identityType%22%3A%22%22%2C%22userType%22%3A%22%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountid%22%3A%22%22%7D',
'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sign': '839932c059141791d8a003f0e6652e14facbf788a502df374fecf9c107d93b9e',
'user-token': '',
'uuid': '1687228791235576552',
}
# Query parameters for the search endpoint; 'timestamp' is a fixed value.
# NOTE(review): presumably 'sign' above must correspond to these exact
# parameters - confirm before changing any value.
params = {
'api_key': '51job',
'timestamp': '1769139097',
'keyword': '华为技术有限公司',
'searchType': '2',
'function': '',
'industry': '',
'jobArea': '000000',
'jobArea2': '',
'landmark': '',
'metro': '',
'salary': '',
'workYear': '',
'degree': '',
'companyType': '',
'companySize': '',
'jobType': '',
'issueDate': '',
'sortType': '0',
'pageNum': '1',
'requestId': '',
'pageSize': '20',
'source': '1',
'accountId': '',
'pageCode': 'sou|sou|soulb',
'scene':'7'
}
# Replace with your own proxy, or do not use one; a single IP is probably
# rate-limited. This dict is not passed to any request in this script.
proxies = {
"http":"http://xxx",
"https":"http://xxxx"
}
# Demo loop (a single iteration): request the search endpoint, and if the
# response carries an ``arg1`` challenge, derive the acw_sc__v2 cookie from it
# and retry with cookies.
for i in range(1, 2):
    try:
        session = requests.session()
        print("%s次请求:" % i)
        # verify=False disables TLS certificate verification.
        response = session.get('https://we.51job.com/api/job/search-pc', params=params, headers=headers, verify=False)
        print(response.text[:300])
        arg1_matches = re.findall(r"arg1='(.*?)';", response.text, re.S)
        # Previously the first match was indexed unconditionally, so a
        # response without a challenge ended in IndexError instead of being
        # printed. The challenge flow now runs only when arg1 is present.
        if arg1_matches:
            arg1 = arg1_matches[0]
            print('arg1--->', arg1)
            guid = str(uuid.uuid4()).replace("-", "")
            cookie = {'guid': str(guid)}
            with open('04.js', 'r', encoding='utf-8') as f:
                js = f.read()
            acw_sc__v2 = execjs.compile(js).call('l', arg1)
            print('acw_sc__v2-->', acw_sc__v2)
            cookie.update({"acw_sc__v2": acw_sc__v2})
            # The second request collects server-set cookies; the third is
            # sent with the complete cookie set.
            response2 = session.get('https://we.51job.com/api/job/search-pc', params=params, headers=headers, cookies=cookie, verify=False)
            cookie.update(response2.cookies.get_dict())
            response = session.get('https://we.51job.com/api/job/search-pc', params=params, headers=headers, cookies=cookie, verify=False)
        print(response.text)
        time.sleep(0.5)
    except Exception as e:
        print(e)

View File

@ -1 +0,0 @@
# 智联招聘

File diff suppressed because it is too large Load Diff

View File

@ -1,68 +0,0 @@
import math
import copy
# 32-symbol alphabet: each symbol encodes one 5-bit group.
R = "0123456789ABCDEFGHIJKLMNOPQRSTUV"


def a(e: str) -> str:
    """Map a string of binary digits (e.g. ``"01011"``) to its symbol in ``R``."""
    return R[int(e, 2)]


def n(e: str) -> str:
    """Encode ``e`` as base-32 text over its 16-bit UTF-16 code units.

    Mirrors the JavaScript routine built on ``charCodeAt``: every UTF-16 code
    unit contributes 16 bits, the bit string is right-padded with zeros to a
    multiple of 5, and each 5-bit group becomes one symbol of ``R``.

    Characters outside the Basic Multilingual Plane are two code units in
    JavaScript. Using ``ord`` for them produced more than 16 bits and a
    different result, so the text is encoded as UTF-16 instead.
    """
    # Big-endian UTF-16 gives exactly two bytes (16 bits) per code unit;
    # "surrogatepass" lets lone surrogates through as single units.
    units = e.encode("utf-16-be", "surrogatepass")
    bits = "".join(format(byte, "08b") for byte in units)
    padded = bits.ljust(5 * math.ceil(len(bits) / 5), "0")
    return "".join(a(padded[i:i + 5]) for i in range(0, len(padded), 5))
def generate_url(e: dict) -> str:
    """Build the search path (and optional query string) from parameters.

    The keys ``jl``, ``jt``, ``in``, ``kw`` and ``p`` become path segments of
    the form ``<key><value>`` in that fixed order, with the ``kw`` value
    passed through ``n``. Every other key with a truthy value is appended as
    ``key=value`` in a query string. Falsy values are dropped. ``e`` itself
    is not modified.
    """
    path_keys = ("jl", "jt", "in", "kw", "p")

    segments = []
    for name in path_keys:
        value = e.get(name)
        if not value:
            continue
        encoded = n(value) if name == "kw" else value
        segments.append(f"{name}{encoded}")

    query = [
        f"{key}={value}"
        for key, value in e.items()
        if value and key not in path_keys
    ]

    result = "/".join(segments)
    if query:
        result += "?" + "&".join(query)
    return result
if __name__ == '__main__':
    # Example: print the search URL for one city code and keyword.
    search_path = generate_url({'jl': 530, 'kw': 'app推广经理'})
    url = "https://www.zhaopin.com/sou/" + search_path
    print(url)

View File

@ -1,284 +0,0 @@
from playwright.sync_api import sync_playwright, BrowserContext, Page
import time
import json
import os
from typing import List, Dict, Optional
from company_spider.zhilianzhaopin_company.searcc_kw import generate_url
class CityLoader:
    """Singleton that maps city names to codes loaded from a JSON file.

    The file is looked up next to this module and read once, when the first
    instance is initialised. Later constructions return the same object and
    skip loading.
    """
    _instance = None

    def __new__(cls, *args, **kwargs):
        if cls._instance:
            return cls._instance
        cls._instance = super(CityLoader, cls).__new__(cls)
        return cls._instance

    def __init__(self, city_file="city.json"):
        # ``city_map`` doubles as the "already initialised" marker.
        if not hasattr(self, 'city_map'):
            module_dir = os.path.dirname(os.path.abspath(__file__))
            self.file_path = os.path.join(module_dir, city_file)
            self.city_map = {}
            self._load_cities()

    def _load_cities(self):
        """Read the city file into ``city_map``; report problems via print."""
        if os.path.exists(self.file_path):
            try:
                with open(self.file_path, 'r', encoding='utf-8') as fh:
                    payload = json.load(fh)
                self._parse_city_data(payload.get("allCity", []))
            except Exception as e:
                print(f"Error loading city file: {e}")
        else:
            print(f"City file not found: {self.file_path}")

    def _parse_city_data(self, cities):
        """Record ``name -> code`` for each entry, descending into ``sublist``."""
        for entry in cities:
            self.city_map[entry['name']] = entry['code']
            children = entry.get('sublist')
            if children:
                self._parse_city_data(children)

    def get_code(self, city_name):
        """Return the code stored for ``city_name``, or ``None`` if unknown."""
        return self.city_map.get(city_name)
def get_companies_from_page(page: Page) -> List[Dict[str, str]]:
    """Collect company names and links from a search-results page.

    Tries a list of CSS selectors in order and uses the first one that
    matches anything; if none match, falls back to every link whose ``href``
    contains ``company``.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts, de-duplicated by
        company name, with relative URLs made absolute against
        ``https://www.zhaopin.com``.
    """
    companies = []
    company_selectors = [
        'a[class*="company"]',
        '.company-name a',
        'a.company-name',
        '[class*="CompanyName"] a',
        'a[href*="/company/"]'
    ]

    company_elements = []
    for selector in company_selectors:
        try:
            elements = page.query_selector_all(selector)
        except Exception:
            # A failing selector is skipped. Unlike the former bare
            # ``except:``, KeyboardInterrupt/SystemExit still propagate.
            continue
        if elements:
            company_elements = elements
            print(f"使用选择器找到 {len(elements)} 个元素: {selector}")
            break

    # Fallback: any link that mentions "company" in its href.
    if not company_elements:
        all_links = page.query_selector_all('a[href*="company"]')
        company_elements = all_links
        print(f"通过通用方法找到 {len(all_links)} 个公司链接")

    seen_names = set()
    for element in company_elements:
        try:
            company_name = element.inner_text().strip()
            company_url = element.get_attribute('href')
        except Exception:
            # Best effort: an element that cannot be read is skipped.
            continue
        if not (company_name and company_url):
            continue
        # Make relative links absolute.
        if company_url.startswith('/'):
            company_url = f"https://www.zhaopin.com{company_url}"
        elif not company_url.startswith('http'):
            company_url = f"https://www.zhaopin.com/{company_url}"
        # Keep only the first link seen for each company name.
        if company_name not in seen_names:
            seen_names.add(company_name)
            companies.append({
                'name': company_name,
                'url': company_url
            })
    return companies
def get_company_intro(context: BrowserContext, company_url: str) -> str:
    """Open a company page in a new tab and return its introduction text.

    Tries several CSS selectors and returns the first non-empty text; if
    none yields text, returns the first 500 characters of the page body.

    Returns:
        The introduction text, ``"未找到公司简介"`` when nothing was found, or
        ``"获取失败: <error>"`` when the page could not be loaded.
    """
    intro_selectors = [
        '.company-intro',
        '.company-description',
        '[class*="intro"]',
        '[class*="description"]',
        '.company-info',
        '[class*="CompanyIntro"]'
    ]
    try:
        company_page = context.new_page()
        # try/finally closes the tab even when navigation or extraction
        # raises; previously such a failure left the page open.
        try:
            company_page.goto(company_url, wait_until="networkidle", timeout=30000)
            time.sleep(2)

            company_intro = ""
            for selector in intro_selectors:
                try:
                    intro_element = company_page.query_selector(selector)
                    if intro_element:
                        company_intro = intro_element.inner_text().strip()
                        if company_intro:
                            break
                except Exception:
                    # A failing selector is skipped; try the next one.
                    continue

            # Last resort: the beginning of the whole page text.
            if not company_intro:
                try:
                    body = company_page.query_selector('body')
                    if body:
                        company_intro = body.inner_text()[:500]
                except Exception:
                    pass
        finally:
            company_page.close()
        return company_intro if company_intro else "未找到公司简介"
    except Exception as e:
        return f"获取失败: {str(e)}"
def _find_target_company(companies: List[Dict], target: str):
    """Return ``(company, label)`` for the best name match, or ``None``.

    An exact name match wins. Otherwise the first company whose name
    contains ``target`` or is contained in it is used. ``label`` is the
    wording used in the progress message.
    """
    for company in companies:
        if company['name'].strip() == target:
            return company, "完全匹配"
    for company in companies:
        name = company['name'].strip()
        if target in name or name in target:
            return company, "部分匹配"
    return None


def crawl_companies(params: Dict, max_companies: int = 10, headless: bool = False, proxy: Optional[str] = None) -> List[Dict]:
    """Crawl company information from a Zhaopin search.

    Args:
        params: search parameters, e.g. ``{'jl': 530, 'kw': 'app推广经理'}`` or
            ``{'city': '北京', 'kw': '...'}``. A ``city`` name is translated
            to a ``jl`` code when ``jl`` is absent. The dict is not modified.
        max_companies: maximum number of companies to fetch when no company
            matches the keyword (default 10).
        headless: run the browser headless (default ``False``).
        proxy: proxy server, e.g. ``"http://user:pass@host:port"``.

    Returns:
        A list of dicts with ``name``, ``url`` and ``intro``. When a company
        name matches the ``kw`` value (exactly, or by containment), the list
        holds only that company.
    """
    # Work on a copy so the caller's dict is not mutated by the city lookup.
    search_params = dict(params)
    if 'city' in search_params and 'jl' not in search_params:
        code = CityLoader().get_code(search_params['city'])
        if code:
            print(f"城市 '{search_params['city']}' 映射代码为: {code}")
            search_params['jl'] = code
        else:
            print(f"未找到城市 '{search_params['city']}' 的代码")
    target_company = search_params.get('kw', '').strip()

    with sync_playwright() as p:
        browser_kwargs = {
            "headless": headless,
            "args": ["--disable-blink-features=AutomationControlled"]
        }
        # Use a locally installed Chrome when present, else the default browser.
        chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
        if os.path.exists(chrome_path):
            browser_kwargs["executable_path"] = chrome_path
        if proxy:
            browser_kwargs["proxy"] = {"server": proxy}
            print(f"使用代理: {proxy}")

        browser = p.chromium.launch(**browser_kwargs)
        # try/finally closes the context and browser on every exit path,
        # including exceptions raised while crawling.
        try:
            context = browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            try:
                page = context.new_page()
                url = f"https://www.zhaopin.com/sou/{generate_url(search_params)}"
                print(f"访问URL: {url}")
                page.goto(url, wait_until="networkidle", timeout=30000)
                time.sleep(3)

                companies = get_companies_from_page(page)
                print(f"\n找到 {len(companies)} 家公司")

                # With a keyword, a matching company name is returned alone.
                if target_company:
                    print(f"搜索目标公司: {target_company}")
                    match = _find_target_company(companies, target_company)
                    if match:
                        company, label = match
                        company_name = company['name'].strip()
                        print(f"找到{label}的公司: {company_name}")
                        print("正在获取公司简介...")
                        return [{
                            'name': company_name,
                            'url': company['url'],
                            'intro': get_company_intro(context, company['url'])
                        }]

                # No match: fetch the first ``max_companies`` companies.
                print(f"未找到完全匹配的公司,获取前 {max_companies} 家公司信息")
                results = []
                selected = companies[:max_companies]
                for i, company in enumerate(selected, 1):
                    print(f"\n[{i}/{min(max_companies, len(companies))}] 正在获取: {company['name']}")
                    results.append({
                        'name': company['name'],
                        'url': company['url'],
                        'intro': get_company_intro(context, company['url'])
                    })
                    time.sleep(1)
                return results
            finally:
                context.close()
        finally:
            browser.close()
if __name__ == '__main__':
    # Manual test run: crawl one city/keyword and print what was found.
    params = {'city': '北京', 'kw': 'app推广经理'}
    results = crawl_companies(params, max_companies=10)

    separator = "=" * 80
    print("\n" + separator)
    print("爬取结果:")
    print(separator)
    for result in results:
        intro = result['intro']
        # Long introductions are cut to 200 characters with an ellipsis.
        if len(intro) > 200:
            intro_line = f"公司简介: {intro[:200]}..."
        else:
            intro_line = f"公司简介: {intro}"
        print(f"\n公司名称: {result['name']}")
        print(f"公司链接: {result['url']}")
        print(intro_line)
        print("-" * 80)

View File

@ -0,0 +1,593 @@
# 爬虫数据上报接口文档
> 适用版本：JobData v1.x
> 更新日期：2026-03-20
> 目标读者：接手爬虫开发或后端对接的工程师
---
## 目录
1. [整体架构](#1-整体架构)
2. [认证方式](#2-认证方式)
3. [核心上报接口](#3-核心上报接口)
- 3.1 [异步批量上报(推荐)](#31-异步批量上报推荐)
- 3.2 [同步批量上报](#32-同步批量上报)
- 3.3 [同步单条上报](#33-同步单条上报)
- 3.4 [平台专属便捷接口](#34-平台专属便捷接口)
4. [各平台数据结构](#4-各平台数据结构)
- 4.1 [BOSS直聘](#41-boss直聘-platformboss)
- 4.2 [前程无忧](#42-前程无忧-platformqcwy)
- 4.3 [智联招聘](#43-智联招聘-platformzhilian)
5. [去重规则](#5-去重规则)
6. [爬虫调用示例](#6-爬虫调用示例)
7. [辅助接口](#7-辅助接口)
8. [数据存储说明](#8-数据存储说明)
9. [常见问题](#9-常见问题)
---
## 1. 整体架构
```
爬虫（Boss / 前程无忧 / 智联）
    │  POST /api/v1/universal/data/batch-store-async
    ▼
FastAPI 后端（app/）
    ├── 去重检查（ClickHouse 查最近 90 天）
    ├── 写入 ClickHouse（job_data 库）
    └── 转发至外部数据平台（qixin.com）
```
三个平台的爬虫**调用同一套接口**,通过 `platform` 字段区分来源,通过 `data_type` 字段区分数据类型(职位/公司)。
---
## 2. 认证方式
数据上报接口属于**内部接口,无需鉴权**。
爬虫调用时统一在 Header 中携带:
```
token: dev
```
> 说明：`dev` 是开发模式 Token，后端不验证签名，直接放行。生产部署如需启用鉴权，改用 JWT Token（HS256，有效期 7 天）。
---
## 3. 核心上报接口
**Base URL**(本地开发):`http://localhost:8000`
两个路径前缀完全等价,行为相同:
- `/api/v1/universal`
- `/api/v1/job`
---
### 3.1 异步批量上报(推荐)
**三个平台爬虫均使用此接口**,立即返回 202后台异步写入。
```
POST /api/v1/universal/data/batch-store-async
```
**Request Headers**
```
Content-Type: application/json
token: dev
```
**Request Body**
```json
{
"data_list": [
{ ...原始职位或公司 JSON... }
],
"data_type": "job",
"platform": "boss",
"check_duplicate": true
}
```
| 字段 | 类型 | 必填 | 说明 |
|------|------|:----:|------|
| `data_list` | `List[Dict]` | ✅ | 原始数据列表,结构见第 4 节 |
| `data_type` | `string` | ✅ | `job`(职位)或 `company`(公司) |
| `platform` | `string` | ✅ | `boss` / `qcwy` / `zhilian` |
| `check_duplicate` | `bool` | ❌ | 默认 `true`；为 `false` 时跳过去重，直接写入 |
**ResponseHTTP 202**
```json
{
"code": 202,
"message": "批量数据已加入异步处理队列,共 10 条",
"platform": "boss",
"data_type": "job"
}
```
---
### 3.2 同步批量上报
同步等待全部写入完成后返回,可获得详细的成功/失败统计。
```
POST /api/v1/universal/data/batch-store
```
**Request Body** — 与 3.1 相同。
**ResponseHTTP 200**
```json
{
"code": 200,
"message": "批量处理完成: 成功 8 条,失败 0 条,重复 2 条",
"data": {
"total": 10,
"success": 8,
"failed": 0,
"duplicate": 2,
"errors": []
},
"platform": "boss",
"data_type": "job"
}
```
---
### 3.3 同步单条上报
```
POST /api/v1/universal/data/store
```
**Request Body**
```json
{
"data": { ...单条原始 JSON... },
"data_type": "job",
"platform": "boss",
"check_duplicate": true
}
```
注意:字段名是 `data`(单条),不是 `data_list`
**ResponseHTTP 200**
```json
{
"code": 200,
"message": "JSON数据存储成功",
"data": {
"success": true,
"message": "JSON数据存储成功",
"duplicate": false,
"table_name": "boss_job",
"storage_type": "json"
},
"platform": "boss",
"data_type": "job"
}
```
数据重复时，`duplicate` 为 `true`，`message` 为 `"数据重复,跳过插入"`，HTTP 仍返回 200。
---
### 3.4 平台专属便捷接口
Request Body 直接传原始 JSON 对象(无需包装 `platform`/`data_type`),等价于 3.3 的 `data` 字段:
| URL | 平台 | 类型 |
|-----|------|------|
| `POST /api/v1/job/boss/job` | BOSS直聘 | 职位 |
| `POST /api/v1/job/boss/company` | BOSS直聘 | 公司 |
| `POST /api/v1/job/qcwy/job` | 前程无忧 | 职位 |
| `POST /api/v1/job/qcwy/company` | 前程无忧 | 公司 |
| `POST /api/v1/job/zhilian/job` | 智联招聘 | 职位 |
| `POST /api/v1/job/zhilian/company` | 智联招聘 | 公司 |
---
## 4. 各平台数据结构
> 以下为 `data_list` 中每个元素的结构,即各平台原始 API 响应体(直接透传,无需转换)。
---
### 4.1 BOSS直聘platform=boss
#### 职位data_type=job
数据来源BOSS 微信小程序接口 `/wapi/zpgeek/miniapp/job/detail.json`
```json
{
"jobBaseInfoVO": {
"jobId": "123456",
"encryptJobId": "abc123",
"positionName": "Python 工程师",
"locationName": "上海",
"locationDesc": "上海市浦东新区XX路XX号",
"jobDesc": "负责数据采集与处理...",
"degreeName": "本科",
"experienceName": "3-5年",
"lowSalary": 15,
"highSalary": 25,
"requiredSkills": ["Python", "爬虫", "ClickHouse"],
"salaryWelfareInfo": ["五险一金", "弹性工作"]
},
"brandComInfoVO": {
"encryptBrandId": "brand_abc",
"brandName": "某科技有限公司",
"industryName": "互联网",
"scaleName": "100-499人",
"introduce": "公司简介..."
},
"bossBaseInfoVO": {
"brandName": "张HR"
}
}
```
**关键去重字段**`jobBaseInfoVO.jobId`
#### 公司data_type=company
数据来源BOSS 微信小程序接口 `/wapi/zpgeek/miniapp/brand/detail.json`
```json
{
"name": "某科技有限公司",
"companyFullInfoVO": {
"name": "某科技有限公司(全称)"
}
}
```
**关键去重字段**：`name` 或 `companyFullInfoVO.name`（取值作为 company_name）
---
### 4.2 前程无忧platform=qcwy
#### 职位data_type=job
数据来源:前程无忧 APP 接口
```json
{
"jobId": "JL123456789",
"updateDateTime": "2026-03-20 10:00:00",
"jobName": "数据工程师",
"companyName": "某公司",
"fullCompanyName": "某公司全称有限公司",
"coId": "CO123456",
"jobDescribe": "岗位职责:...",
"degreeString": "本科",
"workYearString": "3-5年",
"jobSalaryMax": 20000,
"jobSalaryMin": 15000,
"provideSalaryString": "15k-20k",
"termStr": "全职",
"companySizeString": "500-999人",
"companyTypeString": "民营企业",
"major1Str": "互联网/电子商务",
"major2Str": "数据服务",
"jobWelfareCodeDataList": [
{ "chineseTitle": "五险一金", "typeTitle": "社保", "code": "001" }
],
"jobTagsForOrder": ["Python", "Spark", "Hive"],
"location": "上海",
"workLocation": {
"workAddress": "浦东新区",
"address": "上海市浦东新区XX路"
},
"jobAreaString": "上海",
"jobAreaLevelDetail": {
"cityString": "上海",
"landMarkString": "陆家嘴"
},
"confirmDateString": "2026-03-20",
"jobHref": "https://www.51job.com/...",
"companyHref": "https://www.51job.com/..."
}
```
**关键去重字段**`jobId` + `updateDateTime`(两字段联合唯一)
#### 公司data_type=company
```json
{
"companyName": "某公司",
"fullCompanyName": "某公司全称有限公司"
}
```
**关键去重字段**`companyName`
---
### 4.3 智联招聘platform=zhilian
#### 职位data_type=job
数据来源:智联招聘 PC 搜索接口 `https://fe-api.zhaopin.com/c/i/search/positions`
```json
{
"number": "ZL20260320001",
"firstPublishTime": "2026-03-20T10:00:00",
"name": "后端开发工程师",
"jobId": "J001",
"companyName": "某公司",
"companyId": "C001",
"salary60": "15k-25k",
"jobSummary": "职位描述:负责后端服务开发...",
"education": "本科",
"workingExp": "3-5年",
"workType": "全职",
"workCity": "上海",
"cityDistrict": "浦东新区",
"companySize": "500-999人",
"propertyName": "民营企业",
"industryName": "互联网",
"skillLabel": [
{ "value": "Go" },
{ "value": "Python" }
],
"recruitNumber": 3,
"positionURL": "https://www.zhaopin.com/...",
"companyUrl": "https://www.zhaopin.com/...",
"companyDesc": "公司描述(从额外接口补充)"
}
```
**关键去重字段**`number` + `firstPublishTime`(两字段联合唯一)
#### 公司data_type=company
```json
{
"companyName": "某公司",
"name": "某公司"
}
```
**关键去重字段**：`companyName` 或 `name`
---
## 5. 去重规则
| 平台 | 数据类型 | 去重字段 | ClickHouse 表 |
|------|----------|----------|---------------|
| boss | job | `jobBaseInfoVO.jobId` | `boss_job` |
| boss | company | `name` / `companyFullInfoVO.name` | `boss_company` |
| qcwy | job | `jobId` + `updateDateTime` | `qcwy_job` |
| qcwy | company | `companyName` | `qcwy_company` |
| zhilian | job | `number` + `firstPublishTime` | `zhilian_job` |
| zhilian | company | `companyName` / `name` | `zhilian_company` |
- 去重检查范围:**最近 90 天**内已入库的记录。
- 重复数据不报错，正常返回 200，响应中带 `duplicate: true`。
- 传 `check_duplicate: false` 可跳过去重,强制写入(测试时使用)。
---
## 6. 爬虫调用示例
### BOSS直聘`jobs_spider/boss/boos_api.py`
```python
import requests
API_BASE_URL = "http://localhost:8000"
def push_job(zp_data: dict):
"""推送职位数据"""
payload = {
"data_list": [zp_data],
"data_type": "job",
"platform": "boss"
}
resp = requests.post(
f"{API_BASE_URL}/api/v1/universal/data/batch-store-async",
headers={
"accept": "application/json",
"token": "dev",
"Content-Type": "application/json"
},
json=payload,
timeout=30
)
return resp.json()
def push_company(zp_data: dict):
"""推送公司数据"""
payload = {
"data_list": [zp_data],
"data_type": "company",
"platform": "boss"
}
resp = requests.post(
f"{API_BASE_URL}/api/v1/universal/data/batch-store-async",
headers={
"accept": "application/json",
"token": "dev",
"Content-Type": "application/json"
},
json=payload,
timeout=30
)
return resp.json()
```
---
### 前程无忧(`jobs_spider/qcwy/qcwy.py`
```python
import requests
import socket
API_BASE_URL = "http://localhost:8000"
local_ip = socket.gethostbyname(socket.gethostname())
def report_data(data: list, data_type: str = "job"):
"""批量上报数据"""
payload = {
"data_list": data,
"data_type": data_type, # "job" 或 "company"
"platform": "qcwy"
}
resp = requests.post(
f"{API_BASE_URL}/api/v1/universal/data/batch-store-async",
json=payload,
headers={
"accept": "application/json",
"Content-Type": "application/json",
"X-Forwarded-For": local_ip # 传递真实 IP用于日志溯源
},
timeout=300
)
return resp.json()
```
---
### 智联招聘(`jobs_spider/zhilian/zhilian_single.py`
```python
import requests
API_BASE_URL = "http://localhost:8000"
def report_data(data_list: list, data_type: str = "job"):
"""批量上报数据"""
payload = {
"data_list": data_list,
"data_type": data_type, # "job" 或 "company"
"platform": "zhilian"
}
resp = requests.post(
f"{API_BASE_URL}/api/v1/universal/data/batch-store-async",
json=payload,
headers={
"accept": "application/json",
"Content-Type": "application/json"
},
timeout=300
)
return resp.json()
```
---
## 7. 辅助接口
爬虫运行过程中还会调用以下辅助接口:
| 接口 | 说明 | 主要使用方 |
|------|------|-----------|
| `GET /api/v1/token/tokens?page=1&page_size=10` | 获取可用的 MPT Token 列表 | BOSS爬虫 |
| `GET /api/v1/keyword/available?source=boss&limit=1&reserve=True` | 获取下一个未使用的关键词(城市+职位组合) | BOSS爬虫 |
| `POST /api/v1/keyword/mark-used` | 标记关键词已使用 | BOSS爬虫 |
| `GET /api/v1/stats` | 查询各平台已入库数据量 | 监控/运营 |
| `GET /api/v1/platforms` | 查询支持的平台列表及去重字段配置 | 调试 |
| `GET /api/v1/universal/data?platform=boss&data_type=job&page=1&page_size=20` | 分页查询已入库数据 | 调试 |
### 标记关键词已使用 Request Body
```json
{
"source": "boss",
"ids": [1, 2, 3]
}
```
---
## 8. 数据存储说明
### ClickHouse 表结构
所有表均在 `job_data` 数据库下，ENGINE = `MergeTree()`。
**通用列(每张表都有):**
| 列名 | 类型 | 说明 |
|------|------|------|
| `id` | UInt64 | 自增 ID |
| `json_data` | String | 原始 JSON 字符串(完整保存) |
| `created_at` | DateTime | 入库时间 |
| `updated_at` | DateTime | 更新时间 |
**各表额外列(用于去重查询):**
| 表名 | 额外列 |
|------|--------|
| `boss_job` | `job_id String` |
| `boss_company` | `company_name String` |
| `qcwy_job` | `job_id String`, `update_date_time String` |
| `qcwy_company` | `company_name String` |
| `zhilian_job` | `number String`, `first_publish_time String` |
| `zhilian_company` | `company_name String` |
### 统一查询视图
`job_analytics` 视图 UNION ALL 三张职位表,提供统一查询入口:
| 列名 | 说明 |
|------|------|
| `source` | 平台来源boss/qcwy/zhilian |
| `job_id` | 职位唯一标识 |
| `position_name` | 职位名称 |
| `company_name` | 公司名称 |
| `salary_text` | 薪资描述 |
| `city` | 城市 |
| `experience_required` | 经验要求 |
| `education` | 学历要求 |
| `created_at` | 入库时间 |
---
## 9. 常见问题
**Q上报返回 202但数据库里查不到数据**
A异步接口的写入有延迟通常 1-5 秒)。改用同步接口 `batch-store` 可立即确认写入结果。
**Q如何判断某条数据是否已存在**
A调用同步单条上报接口响应中 `duplicate: true` 表示已存在。
**Q`check_duplicate: false` 会导致重复数据吗?**
A会。仅在测试/调试时使用,生产环境保持默认 `true`
**Q三个平台的数据结构差异大如何统一分析**
A使用 `job_analytics` 视图,已将三张表的字段映射为统一列名。
**Q爬虫报超时错误怎么处理**
A：异步接口 timeout 建议设 30s；同步接口因要等待写入完成，建议设 300s。若仍超时，检查 ClickHouse 连接状态。
**Q`token: dev` 在生产环境安全吗?**
A不安全。生产环境应替换为 JWT Token并在接口上挂载鉴权中间件。
---
*文档由 JobData 项目自动生成,如有疑问联系项目维护者。*

38
ecs_full_pipeline.log Normal file
View File

@ -0,0 +1,38 @@
[main] start clearing instances with prefix launch-advisor-20251123
当前地域无实例或无匹配实例,无需清理
[main] clearing completed
[创建] 正在提交创建实例请求
InvalidAccountStatus.NotEnoughBalance
code: 403, Your account does not have enough balance to order postpaid product. request id: 09E71CDD-F721-589B-BF1C-15B63EAD78EC
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=09E71CDD-F721-589B-BF1C-15B63EAD78EC
未获得实例ID终止
[定时] 开始执行 pipeline2026-03-21T12:30:00.005008
[main] start clearing instances with prefix launch-advisor-20251123
当前地域无实例或无匹配实例,无需清理
[main] clearing completed
[创建] 正在提交创建实例请求
InvalidAccountStatus.NotEnoughBalance
code: 403, Your account does not have enough balance to order postpaid product. request id: A14B7A5D-A924-586D-AA89-4D113D5DA2C7
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=A14B7A5D-A924-586D-AA89-4D113D5DA2C7
未获得实例ID终止
[main] start clearing instances with prefix launch-advisor-20251123
当前地域无实例或无匹配实例,无需清理
[main] clearing completed
[创建] 正在提交创建实例请求
InvalidAccountStatus.NotEnoughBalance
code: 403, Your account does not have enough balance to order postpaid product. request id: 577C72C7-7099-517D-A96E-1EE59220AAB3
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=577C72C7-7099-517D-A96E-1EE59220AAB3
未获得实例ID终止
[定时] 开始执行 pipeline2026-03-22T12:30:00.015339
[main] start clearing instances with prefix launch-advisor-20251123
当前地域无实例或无匹配实例,无需清理
[main] clearing completed
[创建] 正在提交创建实例请求
InvalidAccountStatus.NotEnoughBalance
code: 403, Your account does not have enough balance to order postpaid product. request id: 49E530D3-304A-57CE-9FD4-DCE0F2824B20
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=49E530D3-304A-57CE-9FD4-DCE0F2824B20
未获得实例ID终止
[定时] 开始执行 pipeline2026-03-22T18:30:00.007107

View File

@ -7,7 +7,9 @@ from loguru import logger
from app.core.clickhouse import clickhouse_manager
from app.services.crawler.qcwy import QcwyService
from app.services.job import DataRouterService, DataType, PlatformType
from app.services.ingest import IngestService
from app.services.ingest.remote_push import push_to_remote
from app.services.ingest.configs.qcwy import _build_qcwy_push
from app.settings.config import settings
# 提取 jobId 的正则表达式
@ -16,12 +18,12 @@ JOB_ID_REGEX = re.compile(r'/(\d+)\.html')
class LinkRecleaner:
def __init__(self):
self.qcwy_service = QcwyService()
self.data_router = None
self.ingest_service = None
self.semaphore = asyncio.Semaphore(50) # 限制并发
async def init(self):
ch_client = await clickhouse_manager.get_client()
self.data_router = DataRouterService(ch_client)
self.ingest_service = IngestService(ch_client)
async def get_job_id_from_url(self, url: str) -> Optional[str]:
match = JOB_ID_REGEX.search(url)
@ -58,8 +60,8 @@ class LinkRecleaner:
source = "Crawler"
if data:
# 存入数据库供下次使用
await self.data_router.store_data(
data, DataType.JOB, PlatformType.QCWY, check_duplicate=True
await self.ingest_service.store_single(
"qcwy", "mini", "job", data, check_duplicate=True
)
except Exception as e:
logger.error(f"Crawl failed for {job_id}: {e}")
@ -84,14 +86,12 @@ class LinkRecleaner:
# 4. 准备推送数据
try:
remote_data = await self.data_router._prepare_remote_push_data(
data, DataType.JOB, PlatformType.QCWY
)
remote_data = _build_qcwy_push(data)
if remote_data:
# 5. 发送到第三方
success = await self.data_router.send_to_remote_server(remote_data)
status = "Success" if success else "Failed"
success = await push_to_remote(remote_data)
status = "Success" if success else "Failed"
logger.info(f"[{source}] Push {job_id}: {status}")
return success
else:

2
run.py
View File

@ -13,5 +13,5 @@ if __name__ == "__main__":
host = os.getenv("APP_HOST", "0.0.0.0")
port = int(os.getenv("APP_PORT", "9999"))
workers = int(os.getenv("UVICORN_WORKERS", "20"))
workers = int(os.getenv("UVICORN_WORKERS", "1"))
uvicorn.run("app:app", host=host, port=port, workers=workers, log_config=LOGGING_CONFIG)

154
spiderJobs/core/base.py Normal file
View File

@ -0,0 +1,154 @@
"""
core.base - 通用基类与数据结构
提供所有招聘平台共用的ApiResult, BaseFetcher, BaseSearcher
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Callable, Optional
from spiderJobs.core.http_client import HTTPClient
# ─────────────────────────────────────────────
# 通用数据结构
# ─────────────────────────────────────────────
@dataclass
class ApiResult:
    """Uniform result envelope returned by every fetcher and searcher.

    Only ``success`` and ``status_code`` are required; all remaining
    fields fall back to the defaults declared below.
    """

    # Whether the call is considered successful.
    success: bool
    # Status code reported to the caller (``parse_response`` stores the
    # business code or the HTTP code here; ``-1`` is used for exceptions).
    status_code: int
    # Raw payload of the response, if any.
    data: Any = None
    # Items of a list-style response; a fresh empty list per instance.
    list: list[dict] = field(default_factory=list)
    # Item count reported by the response.
    count: int = 0
    # Whether this page is the last one of a paginated listing.
    is_end_page: bool = True
    # Human-readable error description for failed calls.
    error: Optional[str] = None
# ─────────────────────────────────────────────
# 通用响应解析(可覆写)
# ─────────────────────────────────────────────
def parse_response(http_code: int, raw: Any) -> ApiResult:
    """Convert an HTTP status plus decoded body into an ``ApiResult``.

    This is the default parsing strategy; platforms whose responses use a
    different envelope are expected to override the ``_parse`` hook of
    their fetcher/searcher instead.

    Args:
        http_code: HTTP status code of the response.
        raw: Decoded response body. When it is a dict, its ``statusCode``
            entry is treated as the business status code.

    Returns:
        A failed ``ApiResult`` when either the HTTP code or the business
        code is not 200; otherwise a successful one. If the ``data``
        payload is a dict containing ``list``, the list/count/isEndPage
        values are copied into the result as well.
    """
    is_mapping = isinstance(raw, dict)
    biz_code = raw.get("statusCode") if is_mapping else http_code

    if http_code != 200 or biz_code != 200:
        if is_mapping:
            error = (
                raw.get("statusDescription")
                or raw.get("message")
                or f"请求失败: {biz_code}"
            )
        else:
            error = f"请求失败: {http_code}"
        return ApiResult(
            success=False,
            status_code=biz_code or http_code,
            error=error,
        )

    payload = (raw.get("data") or {}) if is_mapping else {}
    if isinstance(payload, dict) and "list" in payload:
        return ApiResult(
            success=True,
            status_code=200,
            data=payload,
            # A JSON ``null`` must not leak through as None: consumers
            # extend/iterate ``list`` and do arithmetic on ``count``.
            list=payload.get("list") or [],
            count=payload.get("count") or 0,
            is_end_page=payload.get("isEndPage", True),
        )
    return ApiResult(success=True, status_code=200, data=payload)
# ─────────────────────────────────────────────
# 基础 FetcherGET 详情类)
# ─────────────────────────────────────────────
class BaseFetcher:
    """Base class for endpoints that return a single object via GET.

    Subclasses must provide:
        ENDPOINT: path of the endpoint.
        _build_params(): the query parameters for the request.

    Subclasses may override:
        _parse(): custom response parsing.
    """

    ENDPOINT: str = ""

    def __init__(self, http_client: HTTPClient):
        self._http = http_client

    def _build_params(self) -> dict:
        """Return the query parameters; must be implemented by subclasses."""
        raise NotImplementedError

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Turn the raw response into an ``ApiResult`` (default parser)."""
        return parse_response(http_code, raw)

    def fetch(self) -> ApiResult:
        """Issue the GET request and return the parsed result.

        Any exception raised while building parameters, sending the
        request or unpacking the response is reported as a failed
        ``ApiResult`` with ``status_code=-1``.
        """
        try:
            status, body = self._http.get(self.ENDPOINT, self._build_params())
        except Exception as exc:
            return ApiResult(success=False, status_code=-1, error=str(exc))
        else:
            return self._parse(status, body)
# ─────────────────────────────────────────────
# 基础 Searcher搜索 + 分页类)
# ─────────────────────────────────────────────
class BaseSearcher:
    """Base class for list endpoints with pagination support.

    Subclasses must provide:
        ENDPOINT: path of the endpoint.
        _build_params(page_index): the request parameters for one page.

    Subclasses may override:
        _request(params): POST by default, can be switched to GET.
        _parse(): custom response parsing.
    """

    ENDPOINT: str = ""

    def __init__(self, page_size: int = 15, http_client: Optional[HTTPClient] = None):
        self.page_size = page_size
        self._http = http_client

    def _build_params(self, page_index: int) -> dict:
        """Return the request parameters for ``page_index``."""
        raise NotImplementedError

    def _request(self, params: dict) -> tuple[int, Any]:
        """Send the request; returns ``(http_code, decoded_body)``."""
        return self._http.post(self.ENDPOINT, params)

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Turn the raw response into an ``ApiResult`` (default parser)."""
        return parse_response(http_code, raw)

    def search(self, page_index: int = 1) -> ApiResult:
        """Fetch and parse a single page.

        Exceptions raised by the request are converted into a failed
        ``ApiResult`` with ``status_code=-1``; exceptions raised by
        ``_build_params`` propagate to the caller.
        """
        params = self._build_params(page_index)
        try:
            http_code, data = self._request(params)
        except Exception as e:
            return ApiResult(success=False, status_code=-1, error=str(e))
        return self._parse(http_code, data)

    def load_all(
        self,
        max_pages: int = 10,
        on_page: Optional[Callable[[ApiResult, int], None]] = None,
    ) -> list[dict]:
        """Load pages sequentially and accumulate their items.

        Args:
            max_pages: Upper bound on the number of pages requested.
            on_page: Optional callback invoked as ``on_page(result, page_index)``
                after each successful page.

        Returns:
            All items collected until a page fails, the end page is
            reached, or ``max_pages`` pages were loaded.
        """
        all_list: list[dict] = []
        for page_index in range(1, max_pages + 1):
            result = self.search(page_index=page_index)
            if not result.success:
                print(f"{page_index} 页失败: {result.error}")
                break
            # A custom parser may leave ``list`` as None (e.g. JSON null);
            # treat that as an empty page instead of raising TypeError.
            all_list.extend(result.list or [])
            if on_page:
                on_page(result, page_index)
            if result.is_end_page:
                break
        return all_list

View File

@ -0,0 +1,155 @@
"""
core.http_client - 通用 HTTP 客户端
基于 requests-go自带 Chrome TLS 指纹伪装
支持代理 IP / 隧道代理 / 代理池轮换
与任何招聘平台无关纯粹负责发请求
"""
from __future__ import annotations
import random
from typing import Any, Optional
import requests_go as requests
from requests_go.tls_config import TLS_CHROME_LATEST
class HTTPClient:
    """Generic HTTP client with TLS fingerprinting and proxy support.

    Args:
        base_url: Base address prepended to every request path.
        default_headers: Headers sent with every request.
        proxy: Fixed proxy address; bound to the shared session.
        tunnel_proxy: Tunnel proxy address; a new session is created for
            every request so that the tunnel can rotate the exit IP.
        proxy_pool: List of proxies; one is picked at random per request.
        timeout: Request timeout in seconds (default 10).

    Proxy precedence: ``tunnel_proxy`` > ``proxy_pool`` > ``proxy``.
    Only one of the three is needed.

    Proxy format examples:
        plain proxy:   "http://127.0.0.1:7890"
        SOCKS5 proxy:  "socks5://127.0.0.1:1080"
        tunnel proxy:  "http://user:pass@tunnel.example.com:12345"

    Tunnel usage::

        client = HTTPClient(
            base_url="https://example.com",
            tunnel_proxy="http://user:pass@tunnel.example.com:12345",
        )
    """

    def __init__(
        self,
        base_url: str,
        default_headers: Optional[dict] = None,
        proxy: Optional[str] = None,
        tunnel_proxy: Optional[str] = None,
        proxy_pool: Optional[list[str]] = None,
        timeout: int = 10,
    ):
        self.base_url = base_url
        self.default_headers = default_headers or {}
        self.timeout = timeout

        # Proxy configuration.
        self._proxy = proxy
        self._tunnel_proxy = tunnel_proxy
        self._proxy_pool = proxy_pool

        # Shared session carrying the TLS fingerprint configuration.
        self._session = requests.Session()
        self._session.tls_config = TLS_CHROME_LATEST
        TLS_CHROME_LATEST.random_ja3 = True

        # A fixed proxy is only bound to the session when neither a pool
        # nor a tunnel is configured (they take precedence).
        if proxy and not proxy_pool and not tunnel_proxy:
            self._session.proxies = {"http": proxy, "https": proxy}

    def _new_session(self) -> requests.Session:
        """Create a brand-new session (used for tunnel-proxy requests)."""
        s = requests.Session()
        s.tls_config = TLS_CHROME_LATEST
        TLS_CHROME_LATEST.random_ja3 = True
        return s

    def _get_proxies(self) -> Optional[dict]:
        """Return the per-request proxy mapping, or None when not needed."""
        if self._proxy_pool:
            # Pick a random pool entry and append a random fragment so the
            # proxy URL differs between requests.
            chosen = random.choice(self._proxy_pool)
            unique = f"{chosen}#{random.randint(100000, 999999)}"
            return {"http": unique, "https": unique}
        # A fixed proxy already lives on the session.
        return None

    def _merge_headers(self, extra: Optional[dict] = None) -> dict:
        """Return the default headers updated with ``extra``."""
        headers = {**self.default_headers}
        if extra:
            headers.update(extra)
        return headers

    def _send(
        self,
        method: str,
        path: str,
        headers: Optional[dict],
        **payload: Any,
    ) -> tuple[int, Any]:
        """Shared implementation of ``get`` and ``post``.

        Args:
            method: Name of the session method to call ("get" or "post").
            path: Path appended to ``base_url``.
            headers: Extra headers merged over the defaults.
            **payload: Method-specific keyword (``params=`` or ``json=``).

        Returns:
            ``(status_code, decoded_json_body)``.
        """
        url = f"{self.base_url}{path}"
        kwargs: dict[str, Any] = {
            **payload,
            "headers": self._merge_headers(headers),
            "timeout": self.timeout,
        }

        # Tunnel proxy: a fresh session per request, always closed afterwards.
        if self._tunnel_proxy:
            session = self._new_session()
            try:
                resp = getattr(session, method)(
                    url,
                    proxies={"http": self._tunnel_proxy, "https": self._tunnel_proxy},
                    **kwargs,
                )
                return resp.status_code, resp.json()
            finally:
                session.close()

        proxies = self._get_proxies()
        if proxies:
            kwargs["proxies"] = proxies
        resp = getattr(self._session, method)(url, **kwargs)
        return resp.status_code, resp.json()

    def post(self, path: str, body: dict, headers: Optional[dict] = None) -> tuple[int, Any]:
        """Send a POST request with ``body`` as JSON."""
        return self._send("post", path, headers, json=body)

    def get(self, path: str, params: Optional[dict] = None, headers: Optional[dict] = None) -> tuple[int, Any]:
        """Send a GET request with ``params`` as the query string."""
        return self._send("get", path, headers, params=params)

View File

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,68 @@
"""
Boss直聘 独立公司爬虫入口
pending_company 队列获取待爬取的 Boss 公司
逐个调用 GetBrandDetail 获取详情并上传
启动:
python -m spiderJobs.platforms.boss.company_main
环境变量:
API_BASE_URL 后端地址 (默认 http://127.0.0.1:9999)
COMPANY_BATCH_SIZE 每批获取公司数 (默认 10)
SLEEP_MIN_SECONDS 最小延迟秒数 (默认 10)
SLEEP_MAX_SECONDS 最大延迟秒数 (默认 20)
BOSS_MPT Boss Token (mpt)
BOSS_WT2 Boss Token (wt2)
"""
from __future__ import annotations
import os
import sys
_project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
if _project_root not in sys.path:
sys.path.insert(0, _project_root)
from spiderJobs.core.base import BaseFetcher
from spiderJobs.platforms.boss.api import GetBrandDetail
from spiderJobs.platforms.boss.client import BossClient, create_client
from spiderJobs.platforms.boss.sign import BossSign
from spiderJobs.runner.company_loop import run_company_loop
def create_company_fetcher(company_id: str, http_client: BossClient) -> BaseFetcher:
    """Build the Boss company-detail fetcher for ``company_id``.

    The company id is passed to ``GetBrandDetail`` as ``brand_id`` and the
    HTTP client as ``client``.
    """
    fetcher = GetBrandDetail(brand_id=company_id, client=http_client)
    return fetcher
def main():
    """Entry point of the Boss company crawler.

    Reads its configuration from environment variables, builds the client
    keyword arguments and starts ``run_company_loop`` for platform "boss".

    Environment variables read here:
        BOSS_MPT / BOSS_WT2: Boss tokens; a ``BossSign`` signer is created
            when at least one of them is set.
        PROXY_TUNNEL: ``host:port`` of a tunnel proxy (optional).
        PROXY_SCHEME: proxy scheme, defaults to "http".
        PROXY_USERNAME / PROXY_PASSWORD: tunnel credentials; used only
            when both are set.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    client_kwargs = {}

    mpt = os.environ.get("BOSS_MPT", "")
    wt2 = os.environ.get("BOSS_WT2", "")
    if mpt or wt2:
        client_kwargs["signer"] = BossSign(mpt=mpt, wt2=wt2)

    tunnel = os.environ.get("PROXY_TUNNEL", "")
    if tunnel:
        scheme = os.environ.get("PROXY_SCHEME", "http")
        username = os.environ.get("PROXY_USERNAME", "")
        password = os.environ.get("PROXY_PASSWORD", "")
        if username and password:
            # Percent-encode the credentials: characters such as "@", ":"
            # or "/" would otherwise corrupt the userinfo part of the URL.
            userinfo = f"{quote(username, safe='')}:{quote(password, safe='')}"
            client_kwargs["tunnel_proxy"] = f"{scheme}://{userinfo}@{tunnel}"
        else:
            client_kwargs["tunnel_proxy"] = f"{scheme}://{tunnel}"

    run_company_loop(
        platform="boss",
        create_company_fetcher=create_company_fetcher,
        create_client_fn=create_client,
        client_kwargs=client_kwargs,
    )
if __name__ == "__main__":
main()

View File

@ -0,0 +1 @@

View File

@ -0,0 +1,59 @@
"""
前程无忧 (51Job) 独立公司爬虫入口
pending_company 队列获取待爬取的 51job 公司
逐个调用 GetCompanyInfo 获取详情并上传
启动:
python -m spiderJobs.platforms.job51.company_main
环境变量:
API_BASE_URL 后端地址 (默认 http://127.0.0.1:9999)
COMPANY_BATCH_SIZE 每批获取公司数 (默认 10)
SLEEP_MIN_SECONDS 最小延迟秒数 (默认 10)
SLEEP_MAX_SECONDS 最大延迟秒数 (默认 20)
"""
from __future__ import annotations
import os
import sys
_project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
if _project_root not in sys.path:
sys.path.insert(0, _project_root)
from spiderJobs.core.base import BaseFetcher
from spiderJobs.platforms.job51.api import GetCompanyInfo
from spiderJobs.platforms.job51.client import Job51Client, create_client
from spiderJobs.runner.company_loop import run_company_loop
def create_company_fetcher(company_id: str, http_client: Job51Client) -> BaseFetcher:
    """Build the 51job company-info fetcher for ``company_id``.

    Both arguments are forwarded to ``GetCompanyInfo`` by keyword.
    """
    fetcher = GetCompanyInfo(company_id=company_id, client=http_client)
    return fetcher
def main():
    """Entry point of the 51job company crawler.

    Reads the optional tunnel-proxy configuration from the environment and
    starts ``run_company_loop`` for platform "qcwy".

    Environment variables read here:
        PROXY_TUNNEL: ``host:port`` of a tunnel proxy (optional).
        PROXY_SCHEME: proxy scheme, defaults to "http".
        PROXY_USERNAME / PROXY_PASSWORD: tunnel credentials; used only
            when both are set.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    client_kwargs = {}

    tunnel = os.environ.get("PROXY_TUNNEL", "")
    if tunnel:
        scheme = os.environ.get("PROXY_SCHEME", "http")
        username = os.environ.get("PROXY_USERNAME", "")
        password = os.environ.get("PROXY_PASSWORD", "")
        if username and password:
            # Percent-encode the credentials: characters such as "@", ":"
            # or "/" would otherwise corrupt the userinfo part of the URL.
            userinfo = f"{quote(username, safe='')}:{quote(password, safe='')}"
            client_kwargs["tunnel_proxy"] = f"{scheme}://{userinfo}@{tunnel}"
        else:
            client_kwargs["tunnel_proxy"] = f"{scheme}://{tunnel}"

    run_company_loop(
        platform="qcwy",
        create_company_fetcher=create_company_fetcher,
        create_client_fn=create_client,
        client_kwargs=client_kwargs,
    )
if __name__ == "__main__":
main()

View File

View File

@ -0,0 +1,53 @@
"""
智联招聘 独立公司爬虫入口
pending_company 队列获取待爬取的智联公司
逐个调用 GetCompanyDetail 获取详情并上传
启动:
python -m spiderJobs.platforms.zhilian.company_main
环境变量:
API_BASE_URL 后端地址 (默认 http://127.0.0.1:9999)
COMPANY_BATCH_SIZE 每批获取公司数 (默认 10)
SLEEP_MIN_SECONDS 最小延迟秒数 (默认 10)
SLEEP_MAX_SECONDS 最大延迟秒数 (默认 20)
"""
from __future__ import annotations
import os
import sys
_project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
if _project_root not in sys.path:
sys.path.insert(0, _project_root)
from spiderJobs.core.base import BaseFetcher
from spiderJobs.platforms.zhilian.api import GetCompanyDetail
from spiderJobs.platforms.zhilian.client import ZhilianClient, create_cgate_client
from spiderJobs.runner.company_loop import run_company_loop
def create_company_fetcher(company_id: str, http_client: ZhilianClient) -> BaseFetcher:
    """Build the Zhilian company-detail fetcher for ``company_id``.

    The company id is passed to ``GetCompanyDetail`` as ``number`` and the
    HTTP client as ``client``.
    """
    fetcher = GetCompanyDetail(number=company_id, client=http_client)
    return fetcher
def main():
    """Entry point of the Zhilian company crawler.

    When the ``PROXY_URL`` environment variable is set and non-empty it is
    passed to the client factory as ``proxy``; then ``run_company_loop``
    is started for platform "zhilian".
    """
    proxy_url = os.environ.get("PROXY_URL", "")
    client_kwargs = {"proxy": proxy_url} if proxy_url else {}

    run_company_loop(
        platform="zhilian",
        create_company_fetcher=create_company_fetcher,
        create_client_fn=create_cgate_client,
        client_kwargs=client_kwargs,
    )
if __name__ == "__main__":
main()

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,4 @@
from spiderJobs.runner.api_client import RunnerAPIClient
from spiderJobs.runner.company_loop import run_company_loop
__all__ = ["RunnerAPIClient", "run_company_loop"]

View File

@ -0,0 +1,214 @@
"""
runner.api_client - 爬虫与后端 API 的通信层
提供关键词获取进度汇报数据上传等功能
爬虫主循环通过此模块与后端交互实现状态管理
"""
from __future__ import annotations
import json
import os
import time
import uuid
from typing import Any, Optional
import requests
class RunnerAPIClient:
    """Backend API client used by crawlers for scheduling and uploads.

    Args:
        base_url: Backend address. Falls back to the ``API_BASE_URL``
            environment variable, then to ``http://127.0.0.1:9999``.
        api_token: Token sent in the ``token`` header. Falls back to the
            ``API_TOKEN`` environment variable, then to ``"dev"``.
        platform: Platform identifier sent as ``source`` / ``platform``.
        crawler_id: Identifier of this crawler; generated from the
            platform and a random suffix when empty.

    All request helpers swallow transport/decoding errors after a few
    retries and return an empty dict, so public methods never raise on
    network failures.
    """

    # Number of attempts made by the low-level request helper.
    _MAX_ATTEMPTS = 3

    def __init__(
        self,
        base_url: str = "",
        api_token: str = "",
        platform: str = "",
        crawler_id: str = "",
    ):
        self.base_url = (
            base_url
            or os.environ.get("API_BASE_URL", "http://127.0.0.1:9999")
        ).rstrip("/")
        # The parameter default must be falsy, otherwise the environment
        # fallback below could never be reached.
        self.api_token = api_token or os.environ.get("API_TOKEN", "dev")
        self.platform = platform
        self.crawler_id = crawler_id or f"{platform}-{uuid.uuid4().hex[:8]}"
        self._session = requests.Session()
        self._session.headers.update({"token": self.api_token})

    # ─────────────────────────────────────────────
    # Keyword scheduling
    # ─────────────────────────────────────────────
    def fetch_keyword(self, limit: int = 1) -> list[dict]:
        """Request available keywords from the backend.

        The request is sent with ``reserve=true`` and this client's
        ``crawler_id``.

        Returns:
            The ``data.items`` list of the response when its ``code`` is
            200, otherwise an empty list.
        """
        resp = self._get(
            "/api/v1/keyword/available",
            params={
                "source": self.platform,
                "limit": limit,
                "reserve": "true",
                "crawler_id": self.crawler_id,
            },
        )
        if resp.get("code") == 200:
            data = resp.get("data")
            if isinstance(data, dict):
                return data.get("items") or []
        return []

    def report_page_progress(
        self,
        keyword_id: int,
        page: int,
        total_pages: int = 0,
        jobs_found: int = 0,
    ) -> dict:
        """Report the crawl progress of a single page."""
        return self._post(
            "/api/v1/keyword/page-progress",
            body={
                "source": self.platform,
                "keyword_id": keyword_id,
                "page": page,
                "total_pages": total_pages,
                "jobs_found": jobs_found,
            },
        )

    def report_crawl_complete(
        self,
        keyword_id: int,
        status: str = "completed",
        error_message: str = "",
    ) -> dict:
        """Report that crawling a keyword finished or failed."""
        return self._post(
            "/api/v1/keyword/crawl-complete",
            body={
                "source": self.platform,
                "keyword_id": keyword_id,
                "status": status,
                "error_message": error_message,
            },
        )

    # ─────────────────────────────────────────────
    # Data upload
    # ─────────────────────────────────────────────
    def upload_data(
        self,
        data_list: list[dict],
        data_type: str = "job",
        channel: str = "mini",
    ) -> dict:
        """Upload a batch of records to the backend.

        Returns:
            The decoded response, ``{}`` when every attempt failed, or a
            synthetic ``code=200`` dict when ``data_list`` is empty (no
            request is sent in that case).
        """
        if not data_list:
            return {"code": 200, "message": "空数据跳过"}

        print(
            f"[上报] {self.platform}/{data_type} | "
            f"条数={len(data_list)} | channel={channel} | "
            f"目标={self.base_url}/api/v1/universal/data/batch-store-async"
        )
        resp = self._post(
            "/api/v1/universal/data/batch-store-async",
            body={
                "data_list": data_list,
                "data_type": data_type,
                "platform": self.platform,
                "channel": channel,
            },
        )
        code = resp.get("code", "?")
        msg = resp.get("msg") or resp.get("message", "")
        data = resp.get("data")
        stored = data.get("stored", "") if isinstance(data, dict) else ""
        print(f"[上报] 响应: code={code} msg={msg} {f'stored={stored}' if stored else ''}")
        return resp

    # ─────────────────────────────────────────────
    # Token management
    # ─────────────────────────────────────────────
    def fetch_token(self) -> Optional[dict]:
        """Return the first token listed for this platform, or None."""
        resp = self._get(
            "/api/v1/token/tokens",
            params={"platform": self.platform},
        )
        if resp.get("code") == 200:
            tokens = resp.get("data") or []
            return tokens[0] if tokens else None
        return None

    # ─────────────────────────────────────────────
    # Company queue
    # ─────────────────────────────────────────────
    def fetch_pending_companies(
        self,
        limit: int = 10,
        status: str = "pending",
    ) -> list[dict]:
        """Request companies of this platform with the given status.

        Returns:
            The ``data`` value of the response when its ``code`` is 200,
            otherwise an empty list.
        """
        resp = self._get(
            "/api/v1/cleaning/companies",
            params={
                "source": self.platform,
                "status": status,
                "page_size": limit,
            },
        )
        if resp.get("code") == 200:
            return resp.get("data") or []
        return []

    def update_company_status(
        self,
        company_id: str,
        status: str = "done",
        error_message: str = "",
    ) -> dict:
        """Update the crawl status of a company (e.g. done / failed)."""
        return self._post(
            "/api/v1/cleaning/update-company-status",
            body={
                "source": self.platform,
                "company_id": company_id,
                "status": status,
                "error_message": error_message,
            },
        )

    # ─────────────────────────────────────────────
    # Low-level HTTP
    # ─────────────────────────────────────────────
    def _request(self, method: str, path: str, *, timeout: int, **kwargs: Any) -> dict:
        """Send a request with retries and return the decoded JSON dict.

        Makes up to ``_MAX_ATTEMPTS`` attempts, sleeping 2s, 4s, ... between
        them (not after the last one). Returns ``{}`` when every attempt
        fails or when the decoded body is not a dict.
        """
        url = f"{self.base_url}{path}"
        send = getattr(self._session, method.lower())
        for attempt in range(self._MAX_ATTEMPTS):
            try:
                payload = send(url, timeout=timeout, **kwargs).json()
            except Exception as e:
                print(f"[API] {method} {path}{attempt + 1}次失败: {e}")
                if attempt + 1 < self._MAX_ATTEMPTS:
                    time.sleep(2 * (attempt + 1))
            else:
                # Callers use ``.get`` on the result, so only dicts are passed on.
                return payload if isinstance(payload, dict) else {}
        return {}

    def _get(self, path: str, params: dict | None = None) -> dict:
        """GET ``path`` with query ``params`` (15 second timeout)."""
        return self._request("GET", path, timeout=15, params=params)

    def _post(self, path: str, body: dict) -> dict:
        """POST ``body`` as JSON to ``path`` (30 second timeout)."""
        return self._request("POST", path, timeout=30, json=body)

View File

@ -0,0 +1,121 @@
"""
runner.company_loop - 独立公司爬虫主循环
从后端 pending_company 队列获取待爬取公司
逐个调用平台 API 获取公司详情并上传
"""
from __future__ import annotations
import os
import time
import traceback
from typing import Any, Callable
from spiderJobs.core.base import BaseFetcher
from spiderJobs.runner.api_client import RunnerAPIClient
from spiderJobs.runner.loop import sleep_random
def _crawl_company(
    api: RunnerAPIClient,
    create_company_fetcher: Callable[[str, Any], BaseFetcher],
    http_client: Any,
    company_id: str,
    label: str,
) -> bool:
    """Fetch one company, upload it and record the outcome; True on success."""
    try:
        result = create_company_fetcher(company_id, http_client).fetch()
        if not (result.success and result.data):
            api.update_company_status(
                company_id,
                status="failed",
                error_message=result.error or "empty data",
            )
            print(f" [FAIL] {label}: {result.error}")
            return False

        data_to_upload = result.data if isinstance(result.data, dict) else {"raw": result.data}
        upload_resp = api.upload_data([data_to_upload], data_type="company")
        if not upload_resp:
            # An empty response means every upload attempt failed; marking
            # the company as done here would silently drop its data.
            api.update_company_status(
                company_id,
                status="failed",
                error_message="upload failed",
            )
            print(f" [FAIL] {label}: upload failed")
            return False

        api.update_company_status(company_id, status="done")
        print(f" [OK] {label}")
        return True
    except Exception as e:
        api.update_company_status(
            company_id,
            status="failed",
            error_message=str(e)[:500],
        )
        print(f" [ERROR] {label}: {e}")
        return False


def run_company_loop(
    *,
    platform: str,
    create_company_fetcher: Callable[[str, Any], BaseFetcher],
    create_client_fn: Callable[..., Any],
    batch_size: int = 10,
    sleep_min: float = 10,
    sleep_max: float = 20,
    api_base_url: str = "",
    client_kwargs: dict | None = None,
) -> None:
    """Main loop of the stand-alone company crawler.

    Repeatedly asks the backend for pending companies, fetches each one
    through the platform fetcher, uploads the result and reports the
    per-company status. Runs until interrupted with ``KeyboardInterrupt``.

    Args:
        platform: Platform identifier (boss / qcwy / zhilian).
        create_company_fetcher: Factory ``(company_id, http_client) -> BaseFetcher``.
        create_client_fn: Factory of the platform HTTP client.
        batch_size: Number of pending companies requested per batch.
        sleep_min: Lower bound of the random delay before each company.
        sleep_max: Upper bound of the random delay before each company.
        api_base_url: Backend API address passed to ``RunnerAPIClient``.
        client_kwargs: Extra keyword arguments for ``create_client_fn``.

    The environment variables ``COMPANY_BATCH_SIZE``, ``SLEEP_MIN_SECONDS``
    and ``SLEEP_MAX_SECONDS`` override the corresponding arguments when set.
    """
    batch_size = int(os.environ.get("COMPANY_BATCH_SIZE", str(batch_size)))
    sleep_min = float(os.environ.get("SLEEP_MIN_SECONDS", str(sleep_min)))
    sleep_max = float(os.environ.get("SLEEP_MAX_SECONDS", str(sleep_max)))

    api = RunnerAPIClient(
        base_url=api_base_url,
        platform=platform,
    )
    print(f"[{platform}-company] 公司爬虫启动 | crawler_id={api.crawler_id}")
    print(f"[{platform}-company] API: {api.base_url} | batch={batch_size} | delay={sleep_min}-{sleep_max}s")

    http_client = create_client_fn(**(client_kwargs or {}))

    while True:
        try:
            # 1. Fetch the next batch of pending companies.
            companies = api.fetch_pending_companies(limit=batch_size, status="pending")
            if not companies:
                print(f"[{platform}-company] 无待处理公司,等待 120s ...")
                time.sleep(120)
                continue
            print(f"\n[{platform}-company] 获取到 {len(companies)} 个待处理公司")

            # 2. Crawl them one by one, with a random delay before each.
            success_count = 0
            fail_count = 0
            for company in companies:
                company_id = company.get("company_id", "")
                company_name = company.get("company_name", "")
                if not company_id:
                    continue
                sleep_random(sleep_min, sleep_max)
                succeeded = _crawl_company(
                    api,
                    create_company_fetcher,
                    http_client,
                    company_id,
                    company_name or company_id,
                )
                if succeeded:
                    success_count += 1
                else:
                    fail_count += 1

            print(
                f"[{platform}-company] 批次完成: 成功={success_count} 失败={fail_count}"
            )
        except KeyboardInterrupt:
            print(f"\n[{platform}-company] 收到中断信号,退出...")
            break
        except Exception as e:
            # Keep the crawler alive on unexpected errors; back off briefly.
            print(f"[{platform}-company] 主循环异常: {e}")
            traceback.print_exc()
            time.sleep(30)

View File

@ -0,0 +1,57 @@
import unittest
from app.services.company_jobs_sync import CompanyJobsSyncService
class CompanyJobsSyncServiceTests(unittest.TestCase):
    """Checks that each platform extractor finds the jobs in its payload."""

    def _assert_job_count(self, extractor, payload, expected=2):
        """Run ``extractor`` on ``payload`` and compare the number of jobs."""
        self.assertEqual(len(extractor(payload)), expected)

    def test_extract_boss_jobs(self):
        # Boss nests the jobs under zpData.jobList.
        job_list = [{"encryptJobId": "job-1"}, {"encryptJobId": "job-2"}]
        self._assert_job_count(
            CompanyJobsSyncService._extract_boss_jobs,
            {"zpData": {"jobList": job_list}},
        )

    def test_extract_qcwy_jobs(self):
        # 51job nests the jobs under resultbody.job.items.
        items = [{"jobId": "1001"}, {"jobId": "1002"}]
        self._assert_job_count(
            CompanyJobsSyncService._extract_qcwy_jobs,
            {"resultbody": {"job": {"items": items}}},
        )

    def test_extract_zhilian_jobs(self):
        # Zhilian nests the jobs under data.list.
        rows = [{"number": "zl-1"}, {"number": "zl-2"}]
        self._assert_job_count(
            CompanyJobsSyncService._extract_zhilian_jobs,
            {"data": {"list": rows}},
        )

    def test_extract_zhilian_jobs_from_top_level_list(self):
        # A top-level "list" key must be handled as well.
        rows = [{"number": "zl-top-1"}, {"number": "zl-top-2"}]
        self._assert_job_count(
            CompanyJobsSyncService._extract_zhilian_jobs,
            {"list": rows},
        )
if __name__ == "__main__":
unittest.main()

View File

@ -0,0 +1,87 @@
import unittest
from app.services.company_storage import extract_company_fields, normalize_company_id
class CompanyStorageTests(unittest.TestCase):
    """Tests for company id normalisation and per-platform field extraction."""

    def _assert_fields(self, result, expected):
        """Assert ``result[key] == value`` for each ``(key, value)`` pair, in order."""
        for key, value in expected:
            self.assertEqual(result[key], value)

    def test_normalize_qcwy_company_id(self):
        cases = [
            ("qcwy", "co123", "123"),
            ("qcwy", "123", "123"),
            ("boss", "co123", "co123"),
        ]
        for platform, raw_id, expected in cases:
            self.assertEqual(normalize_company_id(platform, raw_id), expected)

    def test_extract_boss_fields(self):
        brand_info = {
            "encryptBrandId": "boss-1",
            "brandName": "Boss公司",
            "industryName": "互联网",
            "scaleName": "100-499人",
            "stageName": "B轮",
            "logo": "https://example.com/logo.png",
            "introduce": "品牌简介",
        }
        company_info = {
            "name": "Boss公司",
            "typeName": "民营",
            "cityName": "上海",
            "address": "上海市徐汇区",
            "website": "https://boss.example.com",
        }
        payload = {
            "zpData": {
                "brandComInfoVO": brand_info,
                "companyFullInfoVO": company_info,
            }
        }

        result = extract_company_fields("boss", payload, "boss-1")

        self._assert_fields(result, [
            ("source_company_id", "boss-1"),
            ("company_name", "Boss公司"),
            ("industry", "互联网"),
            ("financing_stage", "B轮"),
        ])

    def test_extract_qcwy_fields(self):
        coinfo = {
            "coid": "123",
            "coname": "前程公司",
            "cotype": "民营",
            "indtype1": "制造业",
            "cosize": "500-999人",
            "areaString": "广州",
            "caddr": "广州市天河区",
            "webUrl": "https://qcwy.example.com",
            "logourl": "https://qcwy.example.com/logo.png",
            "coinfo": "公司简介",
        }
        payload = {
            "coinfo": coinfo,
            "financingStage": {"name": "未融资"},
        }

        result = extract_company_fields("qcwy", payload, "co123")

        self._assert_fields(result, [
            ("source_company_id", "123"),
            ("company_name", "前程公司"),
            ("company_size", "500-999人"),
            ("website", "https://qcwy.example.com"),
        ])

    def test_extract_zhilian_fields(self):
        company_base = {
            "companyNumber": "zl-1",
            "companyName": "智联公司",
            "companyTypeName": "上市公司",
            "industryName": "教育",
            "companySize": "1000-9999人",
            "financingStage": {"name": "已上市"},
            "cityName": "北京",
            "address": "北京市海淀区",
            "companyUrl": "https://zl.example.com",
            "logoUrl": "https://zl.example.com/logo.png",
            "companyDescWithHtml": "<p>公司简介</p>",
        }
        payload = {"data": {"companyBase": company_base}}

        result = extract_company_fields("zhilian", payload, "zl-1")

        self._assert_fields(result, [
            ("source_company_id", "zl-1"),
            ("company_name", "智联公司"),
            ("company_type", "上市公司"),
            ("description", "<p>公司简介</p>"),
        ])
if __name__ == "__main__":
unittest.main()

174
web/CLAUDE.md Normal file
View File

@ -0,0 +1,174 @@
[根目录](../CLAUDE.md) > **web**
# web - Vue3 前端模块
## 模块职责
基于 Vue 3 + Naive UI 构建的管理后台，提供：用户/角色/权限/菜单/部门的系统管理；三平台招聘数据的浏览与搜索；数据采集趋势与来源分布的统计看板（ECharts）；定向数据清洗操作；关键词与代理 IP 管理。
---
## 入口与启动
| 文件 | 说明 |
|------|------|
| `web/src/main.js` | Vue 应用入口，依次初始化 Store、Router、指令、i18n，挂载到 `#app` |
| `web/src/App.vue` | 根组件 |
| `web/src/router/index.js` | Vue Router 配置，含路由守卫（认证、页面 Loading、标题） |
| `web/src/store/index.js` | Pinia Store 入口 |
### 本地开发
```bash
cd web
pnpm install
pnpm dev # Vite dev server默认 http://localhost:5173
pnpm build # 构建到 web/dist/
pnpm lint # ESLint 检查
```
---
## 页面路由结构
| 路由路径 | 视图文件 | 说明 |
|----------|----------|------|
| `/login` | `views/login/index.vue` | 登录页 |
| `/analytics` | `views/analytics/index.vue` | 数据分析看板（ECharts 趋势 + 来源饼图） |
| `/recruitment/qcwy` | `views/recruitment/qcwy/index.vue` | 前程无忧数据浏览 |
| `/recruitment/zhilian` | `views/recruitment/zhilian/index.vue` | 智联招聘数据浏览 |
| `/recruitment/boss` | `views/recruitment/boss/index.vue` | Boss 直聘数据浏览 |
| `/cleaning/targeted` | `views/cleaning/index.vue` | 定向数据清洗 |
| `/cleaning/monitor` | `views/cleaning/monitor.vue` | 清洗任务监控 |
| `/keyword` | `views/keyword/index.vue` | 关键词管理 |
| `/profile` | `views/profile/index.vue` | 个人中心 |
| `/system/user` | `views/system/user/index.vue` | 用户管理 |
| `/system/role` | `views/system/role/index.vue` | 角色管理 |
| `/system/menu` | `views/system/menu/index.vue` | 菜单管理 |
| `/system/api` | `views/system/api/index.vue` | API 权限管理 |
| `/system/dept` | `views/system/dept/index.vue` | 部门管理 |
| `/system/auditlog` | `views/system/auditlog/index.vue` | 审计日志 |
| `/system/proxy` | `views/system/proxy/index.vue` | 代理 IP 管理 |
| `/system/token` | `views/system/token/index.vue` | Boss Token 管理 |
---
## 对外接口（API 调用层）
前端 API 模块位于 `web/src/api/`,通过 axios 封装:
| 文件 | 接口对象 | 说明 |
|------|----------|------|
| `api/index.js` | 所有系统接口 | 用户、角色、菜单、部门、API 管理等 |
| `api/analytics.js` | `getOverview` / `getVolumeTrend` / `getSourceDistribution` | 数据分析统计接口 |
| `api/keyword.js` | keyword CRUD | 关键词管理 |
| `api/proxy.js` | proxy CRUD | 代理 IP 管理 |
| `api/token.js` | token CRUD | Boss Token 管理 |
HTTP 工具层：`web/src/utils/http/`（基于 axios，含拦截器）。
认证：JWT Token 存储于 localStorage，通过请求拦截器自动注入 `Authorization` 头。
---
## 关键依赖与配置
```json
{
"主框架": "vue@^3.3.4",
"UI 库": "naive-ui@^2.34.4",
"状态管理": "pinia@^2.1.6",
"路由": "vue-router@^4.2.4",
"图表": "echarts@^6.0.0",
"HTTP": "axios@^1.4.0",
"图标": "@iconify/vue + @iconify/json",
"工具": "@vueuse/core, lodash-es, dayjs",
"构建": "vite@^4.4.6",
"原子 CSS": "unocss@^66.5.10"
}
```
Vite 配置:`web/vite.config.js`(含 `@vitejs/plugin-vue`, `unocss`, `unplugin-icons`)。
---
## 数据模型(前端状态)
Pinia Store 模块（`web/src/store/modules/`）：
| Store 模块 | 说明 |
|------------|------|
| `user` | 当前登录用户信息、Token |
| `permission` | 动态路由权限(菜单权限列表) |
| `app` | 全局 UI 状态(侧边栏折叠、主题等) |
| `tags` | 多页签（Keep-Alive 页签管理） |
---
## 数据分析看板关键实现
`web/src/views/analytics/index.vue` 使用 ECharts 渲染:
- **趋势折线图**:按 hour/day/week/month 粒度,分 boss/qcwy/zhilian 三条折线,支持 dataZoom 交互
- **来源饼图**:环形饼图展示各平台数据占比
- **时间预设**:近 24h / 7d / 30d / 90d / 6m / 12m / 全部 / 自定义
---
## 测试与质量
- 当前无自动化测试文件(缺口)。
- 代码规范：ESLint（`@zclzone` + `@unocss` 配置）+ `prettier`。
- 建议补充：Vitest 单元测试（工具函数）和 Playwright E2E 测试（登录、数据查看流程）。
---
## 常见问题 (FAQ)
**Q: 前端无法连接后端 API？**
A: 检查 `web/src/utils/http/index.js` 中的 `baseURL` 配置，或在 Vite 配置中设置代理（`vite.config.js` 的 `server.proxy`）。
**Q: 动态路由/菜单不更新?**
A: 登出后重新登录会重新拉取后端菜单树。如果菜单在后端已新增,前端 permission store 会在下次路由守卫拦截时重新请求。
**Q: 图表不显示?**
A: 检查 ClickHouse 是否可达(后端 `/api/v1/analytics/overview` 是否返回数据),以及图表容器 `div` 的高度是否为 0。
---
## 相关文件清单
```
web/src/
├── main.js # 应用入口
├── App.vue # 根组件
├── api/ # API 调用层
│ ├── analytics.js
│ ├── keyword.js
│ ├── proxy.js
│ └── token.js
├── components/ # 公共组件CrudTable, CrudModal, QueryBar 等)
├── layout/ # 布局(侧边栏、顶栏、标签页)
├── router/ # 路由配置与守卫
├── store/ # Pinia Store
├── utils/
│ ├── auth/ # JWT Token 工具
│ ├── http/ # axios 封装
│ └── storage/ # localStorage 封装
└── views/
├── analytics/ # 数据分析看板
├── cleaning/ # 数据清理页面
├── keyword/ # 关键词管理
├── recruitment/ # 三平台数据浏览
│ ├── boss/
│ ├── qcwy/
│ └── zhilian/
└── system/ # 系统管理(用户/角色/菜单/部门/代理/Token/审计)
```
---
## 变更记录 (Changelog)
| 日期 | 说明 |
|------|------|
| 2026-03-20 | 初始化模块文档 |

View File

@ -26,7 +26,7 @@ export function createVitePlugins(viteEnv, isBuild) {
open: true,
gzipSize: true,
brotliSize: true,
}),
})
)
}

View File

@ -25,10 +25,10 @@ export default [
Icons({
compiler: 'vue3',
customCollections: {
custom: FileSystemIconLoader(customIconPath)
custom: FileSystemIconLoader(customIconPath),
},
scale: 1,
defaultClass: 'inline-block'
defaultClass: 'inline-block',
}),
Components({
resolvers: [

1162
web/pnpm-lock.yaml generated

File diff suppressed because it is too large Load Diff

View File

@ -3,10 +3,10 @@ import { request } from '@/utils'
export default {
// 获取统计总览 (Total Jobs)
getOverview: (params) => request.get('/analytics/overview', { params }),
// 获取数据量趋势 (Volume Trend)
getVolumeTrend: (params) => request.get('/analytics/trend/volume', { params }),
// 获取数据来源分布 (Source Distribution)
getSourceDistribution: (params) => request.get('/analytics/distribution/source', { params }),
}

View File

@ -53,7 +53,7 @@ const isEmpty = computed(() => props.empty && !props.loading && network.value)
const showPlaceholder = computed(() => props.loading || isEmpty.value || !network.value)
const networkErrorDesc = computed(() =>
props.showNetworkReload ? `${NETWORK_ERROR_MSG}, 点击重试` : NETWORK_ERROR_MSG,
props.showNetworkReload ? `${NETWORK_ERROR_MSG}, 点击重试` : NETWORK_ERROR_MSG
)
function handleReload() {
@ -71,7 +71,7 @@ const stopHandle = watch(
if (!newValue) {
network.value = window.navigator.onLine
}
},
}
)
onUnmounted(() => {

View File

@ -28,7 +28,7 @@ watchDebounced(
filterIcons()
emit('update:value', choosed.value)
},
{ debounce: 200 },
{ debounce: 200 }
)
</script>

View File

@ -3,7 +3,7 @@
<header v-if="showHeader" mb-15 min-h-45 flex items-center justify-between px-15>
<slot v-if="$slots.header" name="header" />
<template v-else>
<h2 text-22 font-normal text-hex-333 dark:text-hex-ccc>{{ title || route.meta?.title }}</h2>
<h2 text-22 text-hex-333 font-normal dark:text-hex-ccc>{{ title || route.meta?.title }}</h2>
<slot name="action" />
</template>
</header>

View File

@ -8,8 +8,8 @@
max-w-150
flex-shrink-0
text-16
font-bold
color-primary
font-bold
>
{{ title }}
</h2>

View File

@ -48,7 +48,7 @@ watch(
const title = route.meta?.title
tagsStore.addTag({ name, path, title })
},
{ immediate: true },
{ immediate: true }
)
watch(
@ -60,7 +60,7 @@ watch(
const { offsetLeft: x, offsetWidth: width } = activeTabElement
scrollXRef.value?.handleScroll(x + width, width)
},
{ immediate: true },
{ immediate: true }
)
const handleTagClick = (path) => {

View File

@ -2,7 +2,7 @@
<AppPage :show-footer="false">
<div flex-1>
<!-- 筛选栏 -->
<n-card rounded-10 mb-15>
<n-card mb-15 rounded-10>
<n-space align="center">
<n-select
v-model:value="query.preset"
@ -27,16 +27,16 @@
</n-card>
<!-- 概览数据 -->
<n-grid :x-gap="15" :y-gap="15" :cols="4" mb-15>
<n-gi>
<n-card rounded-10 size="small">
<n-statistic label="近选定时间段总量" :value="totalFromSources">
<template #prefix>
<TheIcon icon="mdi:database" color="#2080f0" :size="24" />
</template>
</n-statistic>
</n-card>
</n-gi>
<n-grid :x-gap="15" :y-gap="15" :cols="4" mb-15>
<n-gi>
<n-card rounded-10 size="small">
<n-statistic label="近选定时间段总量" :value="totalFromSources">
<template #prefix>
<TheIcon icon="mdi:database" color="#2080f0" :size="24" />
</template>
</n-statistic>
</n-card>
</n-gi>
<n-gi>
<n-card rounded-10 size="small">
<n-statistic label="Boss直聘" :value="getSourceCount('boss')">
@ -77,11 +77,11 @@
<n-grid :x-gap="15" :y-gap="15" :cols="2">
<n-gi>
<n-card title="数据来源占比" rounded-10>
<div ref="sourceChartRef" style="height: 300px"></div>
<n-card title="数据来源占比" rounded-10>
<div ref="sourceChartRef" style="height: 300px"></div>
</n-card>
</n-gi>
<n-gi>
<n-gi>
<n-card title="系统状态" rounded-10>
<div flex items-center justify-center style="height: 300px; color: #999">
<n-result status="success" title="系统运行正常">
@ -93,7 +93,6 @@
</n-card>
</n-gi>
</n-grid>
</div>
</AppPage>
</template>
@ -114,12 +113,14 @@ const dateRange = ref(null)
const overview = ref({
total_jobs: 0,
period: {}
period: {},
})
const trendData = ref([])
const sourceDistribution = ref([])
const totalFromSources = computed(() => sourceDistribution.value.reduce((sum, i) => sum + (i.job_count || 0), 0))
const totalFromSources = computed(() =>
sourceDistribution.value.reduce((sum, i) => sum + (i.job_count || 0), 0)
)
// Charts refs
const trendChartRef = ref(null)
@ -147,7 +148,7 @@ const rangeOptions = [
]
const getSourceCount = (source) => {
const item = sourceDistribution.value.find(i => i.category === source)
const item = sourceDistribution.value.find((i) => i.category === source)
return item ? item.job_count : 0
}
@ -217,21 +218,20 @@ const handleRefresh = () => {
const fetchData = async () => {
try {
const params = getParams()
// Parallel requests
const [overviewRes, trendRes, sourceRes] = await Promise.all([
api.getOverview(params),
api.getVolumeTrend(params),
api.getSourceDistribution(params)
api.getSourceDistribution(params),
])
const unwrap = (res) => (res && res.data) ? res.data : res
const unwrap = (res) => (res && res.data ? res.data : res)
overview.value = unwrap(overviewRes) || { total_jobs: 0, period: {} }
trendData.value = unwrap(trendRes) || []
sourceDistribution.value = unwrap(sourceRes) || []
renderCharts()
} catch (error) {
console.error(error)
message.error('获取数据失败')
@ -246,81 +246,78 @@ const renderCharts = () => {
const renderTrendChart = () => {
if (!trendChartRef.value) return
if (!trendChart) trendChart = echarts.init(trendChartRef.value)
// Process data for multi-series line chart
const sources = ['boss', 'qcwy', 'zhilian']
// Get all unique timestamps
const times = [...new Set(trendData.value.map(d => d.time))].sort()
const series = sources.map(source => {
const times = [...new Set(trendData.value.map((d) => d.time))].sort()
const series = sources.map((source) => {
return {
name: source === 'boss' ? 'Boss直聘' : source === 'qcwy' ? '前程无忧' : '智联招聘',
type: 'line',
smooth: true,
data: times.map(t => {
const item = trendData.value.find(d => d.time === t && d.source === source)
data: times.map((t) => {
const item = trendData.value.find((d) => d.time === t && d.source === source)
return item ? item.count : 0
})
}),
}
})
const option = {
tooltip: {
trigger: 'axis'
trigger: 'axis',
},
legend: {
data: ['Boss直聘', '前程无忧', '智联招聘']
data: ['Boss直聘', '前程无忧', '智联招聘'],
},
dataZoom: [
{ type: 'slider', realtime: true },
{ type: 'inside' }
],
dataZoom: [{ type: 'slider', realtime: true }, { type: 'inside' }],
grid: {
left: '3%',
right: '4%',
bottom: '3%',
containLabel: true
containLabel: true,
},
xAxis: {
type: 'category',
boundaryGap: false,
data: times.map(t => {
const d = new Date(t)
if (query.value.interval === 'hour') {
return `${d.getMonth()+1}-${d.getDate()} ${d.getHours()}:00`
}
if (query.value.interval === 'week') {
const m = d.getMonth()+1
const day = d.getDate()
return `${d.getFullYear()}-${m}-${day}`
}
if (query.value.interval === 'month') {
const m = d.getMonth()+1
return `${d.getFullYear()}-${m}`
}
return d.toISOString().split('T')[0]
})
data: times.map((t) => {
const d = new Date(t)
if (query.value.interval === 'hour') {
return `${d.getMonth() + 1}-${d.getDate()} ${d.getHours()}:00`
}
if (query.value.interval === 'week') {
const m = d.getMonth() + 1
const day = d.getDate()
return `${d.getFullYear()}-${m}-${day}`
}
if (query.value.interval === 'month') {
const m = d.getMonth() + 1
return `${d.getFullYear()}-${m}`
}
return d.toISOString().split('T')[0]
}),
},
yAxis: {
type: 'value'
type: 'value',
},
series: series
series: series,
}
trendChart.setOption(option)
}
const renderSourceChart = () => {
if (!sourceChartRef.value) return
if (!sourceChart) sourceChart = echarts.init(sourceChartRef.value)
const option = {
tooltip: {
trigger: 'item'
trigger: 'item',
},
legend: {
top: '5%',
left: 'center'
left: 'center',
},
series: [
{
@ -331,30 +328,35 @@ const renderSourceChart = () => {
itemStyle: {
borderRadius: 10,
borderColor: '#fff',
borderWidth: 2
borderWidth: 2,
},
label: {
show: false,
position: 'center'
position: 'center',
},
emphasis: {
label: {
show: true,
fontSize: 20,
fontWeight: 'bold'
}
fontWeight: 'bold',
},
},
labelLine: {
show: false
show: false,
},
data: sourceDistribution.value.map(item => ({
data: sourceDistribution.value.map((item) => ({
value: item.job_count,
name: item.category === 'boss' ? 'Boss直聘' : item.category === 'qcwy' ? '前程无忧' : '智联招聘'
}))
}
]
name:
item.category === 'boss'
? 'Boss直聘'
: item.category === 'qcwy'
? '前程无忧'
: '智联招聘',
})),
},
],
}
sourceChart.setOption(option)
}

View File

@ -1,119 +1,126 @@
<template>
<CommonPage title="定向数据清洗">
<div class="h-full flex flex-col">
<div class="mb-4 flex justify-between items-center bg-white p-4 rounded shadow-sm dark:bg-gray-800">
<n-space>
<n-button type="primary" @click="showUploadModal = true">
<template #icon><TheIcon icon="mdi:cloud-upload" /></template>
上传数据
</n-button>
<n-popconfirm @positive-click="handleClear">
<template #trigger>
<n-button type="error">
<template #icon><TheIcon icon="mdi:delete-forever" /></template>
清空所有
</n-button>
</template>
确定要清空所有清洗任务吗
</n-popconfirm>
</n-space>
</div>
<CrudTable
ref="$table"
v-model:query-items="queryItems"
:columns="columns"
:get-data="api.getCleaningTasks"
>
<template #queryBar>
<QueryBarItem label="状态" :label-width="60">
<n-select
v-model:value="queryItems.status"
:options="statusOptions"
placeholder="状态"
clearable
style="width: 200px"
/>
</QueryBarItem>
<QueryBarItem label="清洗模式" :label-width="80">
<n-select
v-model:value="queryItems.clean_type"
:options="cleanTypeOptions"
placeholder="清洗模式"
clearable
style="width: 220px"
/>
</QueryBarItem>
<QueryBarItem label="目标" :label-width="50">
<n-input
v-model:value="queryItems.target"
placeholder="搜索目标"
clearable
@keyup.enter="$table?.handleSearch()"
style="width: 260px"
/>
</QueryBarItem>
<div
class="mb-4 flex items-center justify-between rounded bg-white p-4 shadow-sm dark:bg-gray-800"
>
<n-space>
<n-button type="primary" @click="showUploadModal = true">
<template #icon><TheIcon icon="mdi:cloud-upload" /></template>
上传数据
</n-button>
<n-popconfirm @positive-click="handleClear">
<template #trigger>
<n-button type="error">
<template #icon><TheIcon icon="mdi:delete-forever" /></template>
清空所有
</n-button>
</template>
</CrudTable>
确定要清空所有清洗任务吗
</n-popconfirm>
</n-space>
</div>
<!-- Upload Modal -->
<n-modal v-model:show="showUploadModal" preset="card" title="上传待清洗数据" style="width: 600px">
<n-form label-placement="left" label-width="100">
<n-form-item label="清洗模式">
<div class="flex flex-col w-full gap-2">
<n-select v-model:value="uploadForm.cleanType" :options="cleanTypeOptions" />
<div v-if="currentExample" class="text-xs text-gray-500">
<n-a :href="currentExample.url" target="_blank" download class="flex items-center">
<TheIcon icon="mdi:download" class="mr-1" />
下载{{ currentExample.name }}示例文件
</n-a>
</div>
</div>
</n-form-item>
<n-form-item label="目标平台">
<n-select v-model:value="uploadForm.platform" :options="platformOptions" />
</n-form-item>
<n-form-item label="代理地址">
<div class="flex flex-col w-full gap-2">
<n-select
v-model:value="selectedProxyId"
:options="proxyOptions"
placeholder="选择已配置的代理(可选)"
clearable
@update:value="handleProxySelect"
/>
<n-input
v-model:value="uploadForm.proxy"
placeholder="例如 http://user:pass@ip:port留空则使用默认"
/>
</div>
</n-form-item>
<n-form-item label="文件">
<n-upload
directory-dnd
:custom-request="customUploadRequest"
:max="1"
accept=".txt,.csv,.xlsx"
>
<n-upload-dragger>
<div style="margin-bottom: 12px">
<TheIcon icon="mdi:cloud-upload" :size="48" />
</div>
<n-text style="font-size: 16px">点击或拖动文件上传</n-text>
<n-p depth="3" style="margin: 8px 0 0 0">
支持 .txt, .csv, .xlsx 文件每行一个目标<br>
请确保文件内容与选择的清洗模式和平台一致
</n-p>
</n-upload-dragger>
</n-upload>
</n-form-item>
</n-form>
</n-modal>
<!-- Detail Modal -->
<n-modal v-model:show="showDetailModal" preset="card" title="结果详情" style="width: 600px">
<n-code :code="detailJson" language="json" word-wrap />
</n-modal>
<CrudTable
ref="$table"
v-model:query-items="queryItems"
:columns="columns"
:get-data="api.getCleaningTasks"
>
<template #queryBar>
<QueryBarItem label="状态" :label-width="60">
<n-select
v-model:value="queryItems.status"
:options="statusOptions"
placeholder="状态"
clearable
style="width: 200px"
/>
</QueryBarItem>
<QueryBarItem label="清洗模式" :label-width="80">
<n-select
v-model:value="queryItems.clean_type"
:options="cleanTypeOptions"
placeholder="清洗模式"
clearable
style="width: 220px"
/>
</QueryBarItem>
<QueryBarItem label="目标" :label-width="50">
<n-input
v-model:value="queryItems.target"
placeholder="搜索目标"
clearable
style="width: 260px"
@keyup.enter="$table?.handleSearch()"
/>
</QueryBarItem>
</template>
</CrudTable>
<!-- Upload Modal -->
<n-modal
v-model:show="showUploadModal"
preset="card"
title="上传待清洗数据"
style="width: 600px"
>
<n-form label-placement="left" label-width="100">
<n-form-item label="清洗模式">
<div class="w-full flex flex-col gap-2">
<n-select v-model:value="uploadForm.cleanType" :options="cleanTypeOptions" />
<div v-if="currentExample" class="text-xs text-gray-500">
<n-a :href="currentExample.url" target="_blank" download class="flex items-center">
<TheIcon icon="mdi:download" class="mr-1" />
下载{{ currentExample.name }}示例文件
</n-a>
</div>
</div>
</n-form-item>
<n-form-item label="目标平台">
<n-select v-model:value="uploadForm.platform" :options="platformOptions" />
</n-form-item>
<n-form-item label="代理地址">
<div class="w-full flex flex-col gap-2">
<n-select
v-model:value="selectedProxyId"
:options="proxyOptions"
placeholder="选择已配置的代理(可选)"
clearable
@update:value="handleProxySelect"
/>
<n-input
v-model:value="uploadForm.proxy"
placeholder="例如 http://user:pass@ip:port留空则使用默认"
/>
</div>
</n-form-item>
<n-form-item label="文件">
<n-upload
directory-dnd
:custom-request="customUploadRequest"
:max="1"
accept=".txt,.csv,.xlsx"
>
<n-upload-dragger>
<div style="margin-bottom: 12px">
<TheIcon icon="mdi:cloud-upload" :size="48" />
</div>
<n-text style="font-size: 16px">点击或拖动文件上传</n-text>
<n-p depth="3" style="margin: 8px 0 0 0">
支持 .txt, .csv, .xlsx 文件每行一个目标<br />
请确保文件内容与选择的清洗模式和平台一致
</n-p>
</n-upload-dragger>
</n-upload>
</n-form-item>
</n-form>
</n-modal>
<!-- Detail Modal -->
<n-modal v-model:show="showDetailModal" preset="card" title="结果详情" style="width: 600px">
<n-code :code="detailJson" language="json" word-wrap />
</n-modal>
</div>
</CommonPage>
</template>
@ -137,81 +144,81 @@ const showDetailModal = ref(false)
const detailJson = ref('')
const uploadForm = reactive({
cleanType: 'auto',
platform: 'auto',
proxy: ''
cleanType: 'auto',
platform: 'auto',
proxy: '',
})
const selectedProxyId = ref(null)
const allProxies = ref([])
const proxyOptions = computed(() => {
if (!allProxies.value || !allProxies.value.length) return []
const platform = uploadForm.platform
return allProxies.value
.filter((item) => {
if (!item.is_active) return false
if (platform === 'auto') return true
return item.platform === 'all' || item.platform === platform
})
.map((item) => ({
label: `${item.name} (${item.proxy_type.toUpperCase()})`,
value: item.id
}))
if (!allProxies.value || !allProxies.value.length) return []
const platform = uploadForm.platform
return allProxies.value
.filter((item) => {
if (!item.is_active) return false
if (platform === 'auto') return true
return item.platform === 'all' || item.platform === platform
})
.map((item) => ({
label: `${item.name} (${item.proxy_type.toUpperCase()})`,
value: item.id,
}))
})
const loadProxies = async () => {
try {
const res = await proxyApi.list({ page: 1, page_size: 100, is_active: true })
allProxies.value = res.data || res.data?.data || []
} catch (e) {
allProxies.value = []
}
try {
const res = await proxyApi.list({ page: 1, page_size: 100, is_active: true })
allProxies.value = res.data || res.data?.data || []
} catch (e) {
allProxies.value = []
}
}
const handleProxySelect = (val) => {
selectedProxyId.value = val
if (!val) return
const found = allProxies.value.find((item) => item.id === val)
if (found) {
uploadForm.proxy = found.proxy_url
}
selectedProxyId.value = val
if (!val) return
const found = allProxies.value.find((item) => item.id === val)
if (found) {
uploadForm.proxy = found.proxy_url
}
}
const currentExample = computed(() => {
const type = uploadForm.cleanType
const platform = uploadForm.platform
let url = ''
let name = ''
if (type === 'company_name') {
name = '公司名称'
url = '/static/examples/company_names.txt'
} else if (type === 'company_id') {
name = '公司ID'
url = '/static/examples/company_ids.txt'
} else if (type === 'clean_url') {
name = 'URL链接'
url = '/static/examples/boss_urls.txt'
}
// Platform specific overrides
if (platform === 'boss') {
if (type === 'company_id') url = '/static/examples/boss_com_ids.txt'
if (type === 'company_name') url = '/static/examples/boss_com_names.txt'
if (type === 'clean_url') url = '/static/examples/boss_urls.txt'
} else if (platform === 'qcwy') {
if (type === 'company_id') url = '/static/examples/qcwy_com_ids.txt'
if (type === 'company_name') url = '/static/examples/qcwy_com_names.txt'
if (type === 'clean_url') url = '/static/examples/qcwy_urls.txt'
} else if (platform === 'zhilian') {
if (type === 'company_id') url = '/static/examples/zhaopin_com_ids.txt'
if (type === 'company_name') url = '/static/examples/zhaopin_com_names.txt'
if (type === 'clean_url') url = '/static/examples/zhaopin_urls.txt'
}
if (!url) return null
return { name, url }
const type = uploadForm.cleanType
const platform = uploadForm.platform
let url = ''
let name = ''
if (type === 'company_name') {
name = '公司名称'
url = '/static/examples/company_names.txt'
} else if (type === 'company_id') {
name = '公司ID'
url = '/static/examples/company_ids.txt'
} else if (type === 'clean_url') {
name = 'URL链接'
url = '/static/examples/boss_urls.txt'
}
// Platform specific overrides
if (platform === 'boss') {
if (type === 'company_id') url = '/static/examples/boss_com_ids.txt'
if (type === 'company_name') url = '/static/examples/boss_com_names.txt'
if (type === 'clean_url') url = '/static/examples/boss_urls.txt'
} else if (platform === 'qcwy') {
if (type === 'company_id') url = '/static/examples/qcwy_com_ids.txt'
if (type === 'company_name') url = '/static/examples/qcwy_com_names.txt'
if (type === 'clean_url') url = '/static/examples/qcwy_urls.txt'
} else if (platform === 'zhilian') {
if (type === 'company_id') url = '/static/examples/zhaopin_com_ids.txt'
if (type === 'company_name') url = '/static/examples/zhaopin_com_names.txt'
if (type === 'clean_url') url = '/static/examples/zhaopin_urls.txt'
}
if (!url) return null
return { name, url }
})
const cleanTypeOptions = [
@ -219,207 +226,228 @@ const cleanTypeOptions = [
{ label: '公司名称清洗', value: 'company_name' },
{ label: '公司ID清洗', value: 'company_id' },
{ label: 'URL清洗', value: 'clean_url' },
{ label: '公司Jobs清洗', value: 'company_jobs' }
{ label: '公司Jobs清洗', value: 'company_jobs' },
]
const platformOptions = [
{ label: '自动识别', value: 'auto' },
{ label: 'Boss直聘', value: 'boss' },
{ label: '前程无忧', value: 'qcwy' },
{ label: '智联招聘', value: 'zhilian' }
{ label: '智联招聘', value: 'zhilian' },
]
const statusOptions = [
{ label: '待处理', value: 'pending' },
{ label: '处理中', value: 'processing' },
{ label: '成功', value: 'success' },
{ label: '失败', value: 'fail' }
{ label: '待处理', value: 'pending' },
{ label: '处理中', value: 'processing' },
{ label: '成功', value: 'success' },
{ label: '失败', value: 'fail' },
]
const customUploadRequest = async ({ file, onFinish, onError }) => {
const formData = new FormData()
formData.append('file', file.file)
formData.append('clean_type', uploadForm.cleanType)
formData.append('platform', uploadForm.platform)
if (uploadForm.proxy) {
formData.append('proxy', uploadForm.proxy)
}
try {
await api.uploadCleaningFile(formData)
message.success('上传成功')
showUploadModal.value = false
$table.value?.handleSearch()
onFinish()
} catch (error) {
message.error('上传失败')
onError()
}
const formData = new FormData()
formData.append('file', file.file)
formData.append('clean_type', uploadForm.cleanType)
formData.append('platform', uploadForm.platform)
if (uploadForm.proxy) {
formData.append('proxy', uploadForm.proxy)
}
try {
await api.uploadCleaningFile(formData)
message.success('上传成功')
showUploadModal.value = false
$table.value?.handleSearch()
onFinish()
} catch (error) {
message.error('上传失败')
onError()
}
}
const handleClear = async () => {
try {
await api.clearCleaningTasks()
message.success('已清空')
$table.value?.handleSearch()
} catch (error) {
message.error('操作失败')
}
try {
await api.clearCleaningTasks()
message.success('已清空')
$table.value?.handleSearch()
} catch (error) {
message.error('操作失败')
}
}
const handleProcess = async (row) => {
try {
row.processing = true // Optimistic UI update if possible, or just loading state
await api.processCleaningTask(row.id)
message.success('处理完成')
$table.value?.handleSearch()
} catch (error) {
message.error('处理失败')
} finally {
row.processing = false
}
try {
row.processing = true // Optimistic UI update if possible, or just loading state
await api.processCleaningTask(row.id)
message.success('处理完成')
$table.value?.handleSearch()
} catch (error) {
message.error('处理失败')
} finally {
row.processing = false
}
}
const handleDelete = async (row) => {
try {
await api.deleteCleaningTask({ id: row.id })
message.success('删除成功')
$table.value?.handleSearch()
} catch (error) {
message.error('删除失败')
}
try {
await api.deleteCleaningTask({ id: row.id })
message.success('删除成功')
$table.value?.handleSearch()
} catch (error) {
message.error('删除失败')
}
}
const showDetail = (row) => {
try {
const displayData = {
result: row.result_summary,
original_data: row.original_data,
error: row.error_msg
}
// Filter out null/undefined values for cleaner display
const cleanData = Object.fromEntries(
Object.entries(displayData).filter(([_, v]) => v != null)
)
detailJson.value = JSON.stringify(cleanData, null, 2)
showDetailModal.value = true
} catch (e) {
detailJson.value = '{}'
try {
const displayData = {
result: row.result_summary,
original_data: row.original_data,
error: row.error_msg,
}
// Filter out null/undefined values for cleaner display
const cleanData = Object.fromEntries(Object.entries(displayData).filter(([_, v]) => v != null))
detailJson.value = JSON.stringify(cleanData, null, 2)
showDetailModal.value = true
} catch (e) {
detailJson.value = '{}'
}
}
const columns = [
{ title: 'ID', key: 'id', width: 60, align: 'center' },
{
title: '平台',
key: 'platform',
width: 100,
render(row) {
const map = {
'boss': 'Boss直聘',
'qcwy': '前程无忧',
'zhilian': '智联招聘',
'auto': '自动识别'
}
return map[row.platform] || row.platform
}
{ title: 'ID', key: 'id', width: 60, align: 'center' },
{
title: '平台',
key: 'platform',
width: 100,
render(row) {
const map = {
boss: 'Boss直聘',
qcwy: '前程无忧',
zhilian: '智联招聘',
auto: '自动识别',
}
return map[row.platform] || row.platform
},
{ title: '目标', key: 'target', width: 200, ellipsis: { tooltip: true } },
{ title: '代理', key: 'proxy', width: 220, ellipsis: { tooltip: true } },
{
title: '清洗模式',
key: 'clean_type',
width: 100,
render(row) {
const opt = cleanTypeOptions.find(o => o.value === row.clean_type)
return opt ? opt.label : row.clean_type
}
},
{ title: '目标', key: 'target', width: 200, ellipsis: { tooltip: true } },
{ title: '代理', key: 'proxy', width: 220, ellipsis: { tooltip: true } },
{
title: '清洗模式',
key: 'clean_type',
width: 100,
render(row) {
const opt = cleanTypeOptions.find((o) => o.value === row.clean_type)
return opt ? opt.label : row.clean_type
},
{
title: '状态',
key: 'status',
width: 100,
align: 'center',
render(row) {
const statusMap = {
pending: { type: 'default', text: '待处理' },
processing: { type: 'info', text: '处理中' },
success: { type: 'success', text: '成功' },
fail: { type: 'error', text: '失败' }
}
const s = statusMap[row.status] || { type: 'default', text: row.status }
return h(NTag, { type: s.type, bordered: false }, { default: () => s.text })
}
},
{
title: '状态',
key: 'status',
width: 100,
align: 'center',
render(row) {
const statusMap = {
pending: { type: 'default', text: '待处理' },
processing: { type: 'info', text: '处理中' },
success: { type: 'success', text: '成功' },
fail: { type: 'error', text: '失败' },
}
const s = statusMap[row.status] || { type: 'default', text: row.status }
return h(NTag, { type: s.type, bordered: false }, { default: () => s.text })
},
{
title: '存储状态',
key: 'storage_status',
width: 120,
align: 'center',
render(row) {
const map = {
saved: { type: 'success', text: '已入库' },
duplicate: { type: 'warning', text: '重复跳过' },
failed: { type: 'error', text: '入库失败' },
unknown: { type: 'default', text: '-' }
}
const s = map[row.storage_status] || { type: 'default', text: row.storage_status }
return h(NTag, { type: s.type, bordered: false, size: 'small' }, { default: () => s.text })
}
},
{
title: '存储状态',
key: 'storage_status',
width: 120,
align: 'center',
render(row) {
const map = {
saved: { type: 'success', text: '已入库' },
duplicate: { type: 'warning', text: '重复跳过' },
failed: { type: 'error', text: '入库失败' },
unknown: { type: 'default', text: '-' },
}
const s = map[row.storage_status] || { type: 'default', text: row.storage_status }
return h(NTag, { type: s.type, bordered: false, size: 'small' }, { default: () => s.text })
},
{
title: '远程推送',
key: 'remote_sent',
width: 100,
align: 'center',
render(row) {
if (row.status !== 'success') return '-'
return row.remote_sent
? h(NTag, { type: 'success', size: 'small' }, { default: () => '已发送' })
: h(NTag, { type: 'default', size: 'small' }, { default: () => '发送' })
}
},
{
title: '远程推送',
key: 'remote_sent',
width: 100,
align: 'center',
render(row) {
if (row.status !== 'success') return '-'
return row.remote_sent
? h(NTag, { type: 'success', size: 'small' }, { default: () => '发送' })
: h(NTag, { type: 'default', size: 'small' }, { default: () => '未发送' })
},
{
title: '操作',
key: 'actions',
width: 200,
align: 'center',
fixed: 'right',
render(row) {
return h(NSpace, { justify: 'center' }, {
default: () => [
h(NButton, {
size: 'small',
type: 'primary',
ghost: true,
disabled: row.status === 'processing',
onClick: () => handleProcess(row)
}, { default: () => row.status === 'success' ? '重试' : '执行' }),
(row.status === 'success' || row.status === 'fail') ? h(NButton, {
size: 'small',
onClick: () => showDetail(row)
}, { default: () => '详情' }) : null,
},
{
title: '操作',
key: 'actions',
width: 200,
align: 'center',
fixed: 'right',
render(row) {
return h(
NSpace,
{ justify: 'center' },
{
default: () => [
h(
NButton,
{
size: 'small',
type: 'primary',
ghost: true,
disabled: row.status === 'processing',
onClick: () => handleProcess(row),
},
{ default: () => (row.status === 'success' ? '重试' : '执行') }
),
h(NPopconfirm, {
onPositiveClick: () => handleDelete(row)
}, {
trigger: () => h(NButton, {
size: 'small',
type: 'error',
text: true,
}, { default: () => '删除', icon: renderIcon('mdi:delete', { size: 16 }) }),
default: () => '确定删除该任务吗?'
})
]
})
row.status === 'success' || row.status === 'fail'
? h(
NButton,
{
size: 'small',
onClick: () => showDetail(row),
},
{ default: () => '详情' }
)
: null,
h(
NPopconfirm,
{
onPositiveClick: () => handleDelete(row),
},
{
trigger: () =>
h(
NButton,
{
size: 'small',
type: 'error',
text: true,
},
{ default: () => '删除', icon: renderIcon('mdi:delete', { size: 16 }) }
),
default: () => '确定删除该任务吗?',
}
),
],
}
}
)
},
},
]
onMounted(() => {
loadProxies()
$table.value?.handleSearch()
loadProxies()
$table.value?.handleSearch()
})
</script>

View File

@ -1,4 +1,3 @@
const Layout = () => import('@/layout/index.vue')
export default {

View File

@ -1,6 +1,18 @@
<script setup>
import { h, ref, watch, onMounted } from 'vue'
import { NButton, NInput, NTabs, NTabPane, NPopconfirm, NForm, NFormItem, NGrid, NGridItem, NCard, NStatistic } from 'naive-ui'
import {
NButton,
NInput,
NTabs,
NTabPane,
NPopconfirm,
NForm,
NFormItem,
NGrid,
NGridItem,
NCard,
NStatistic,
} from 'naive-ui'
import CommonPage from '@/components/page/CommonPage.vue'
import QueryBarItem from '@/components/query-bar/QueryBarItem.vue'
import CrudModal from '@/components/table/CrudModal.vue'
@ -42,22 +54,22 @@ const getList = (params) => {
}
const doCreate = (data) => {
return api.create(data, { source: activeTab.value }).then(res => {
fetchStats()
return res
return api.create(data, { source: activeTab.value }).then((res) => {
fetchStats()
return res
})
}
const doUpdate = (data) => {
const { id, ...rest } = data
return api.update({ id, source: activeTab.value }, rest)
const { id, ...rest } = data
return api.update({ id, source: activeTab.value }, rest)
}
const doDelete = (data) => {
return api.delete({ id: data.id, source: activeTab.value }).then(res => {
fetchStats()
return res
})
return api.delete({ id: data.id, source: activeTab.value }).then((res) => {
fetchStats()
return res
})
}
const {
@ -88,23 +100,23 @@ const columns = [
{ title: 'ID', key: 'id', width: 60, align: 'center' },
{ title: '城市', key: 'city', width: 100, align: 'center' },
{ title: '职位关键词', key: 'job', width: 150, align: 'center' },
{
title: '最后请求日期',
key: 'last_requested_date',
width: 120,
align: 'center',
render(row) {
return row.last_requested_date || '-'
}
{
title: '最后请求日期',
key: 'last_requested_date',
width: 120,
align: 'center',
render(row) {
return row.last_requested_date || '-'
},
},
{
title: '最后请求时间',
key: 'last_requested_at',
width: 150,
align: 'center',
render(row) {
return row.last_requested_at ? formatDate(row.last_requested_at) : '-'
}
{
title: '最后请求时间',
key: 'last_requested_at',
width: 150,
align: 'center',
render(row) {
return row.last_requested_at ? formatDate(row.last_requested_at) : '-'
},
},
{
title: '操作',
@ -134,7 +146,10 @@ const columns = [
h(
NButton,
{ size: 'small', type: 'error' },
{ default: () => '删除', icon: renderIcon('material-symbols:delete-outline', { size: 16 }) }
{
default: () => '删除',
icon: renderIcon('material-symbols:delete-outline', { size: 16 }),
}
),
default: () => '确定删除该关键词吗?',
}
@ -153,11 +168,11 @@ const columns = [
<n-card>
<n-statistic label="Boss直聘">
<template #prefix>
<div class="text-sm font-semibold text-gray-700">已用 / 总数</div>
<div class="text-sm text-gray-700 font-semibold">已用 / 总数</div>
</template>
{{ overviewStats.boss.used }} / {{ overviewStats.boss.total }}
<template #suffix>
<span class="text-xs text-green-600" v-if="overviewStats.boss.total > 0">
<span v-if="overviewStats.boss.total > 0" class="text-xs text-green-600">
{{ ((overviewStats.boss.used / overviewStats.boss.total) * 100).toFixed(1) }}%
</span>
</template>
@ -168,11 +183,11 @@ const columns = [
<n-card>
<n-statistic label="前程无忧">
<template #prefix>
<div class="text-sm font-semibold text-gray-700">已用 / 总数</div>
<div class="text-sm text-gray-700 font-semibold">已用 / 总数</div>
</template>
{{ overviewStats.qcwy.used }} / {{ overviewStats.qcwy.total }}
<template #suffix>
<span class="text-xs text-green-600" v-if="overviewStats.qcwy.total > 0">
<span v-if="overviewStats.qcwy.total > 0" class="text-xs text-green-600">
{{ ((overviewStats.qcwy.used / overviewStats.qcwy.total) * 100).toFixed(1) }}%
</span>
</template>
@ -183,12 +198,14 @@ const columns = [
<n-card>
<n-statistic label="智联招聘">
<template #prefix>
<div class="text-sm font-semibold text-gray-700">已用 / 总数</div>
<div class="text-sm text-gray-700 font-semibold">已用 / 总数</div>
</template>
{{ overviewStats.zhilian.used }} / {{ overviewStats.zhilian.total }}
<template #suffix>
<span class="text-xs text-green-600" v-if="overviewStats.zhilian.total > 0">
{{ ((overviewStats.zhilian.used / overviewStats.zhilian.total) * 100).toFixed(1) }}%
<span v-if="overviewStats.zhilian.total > 0" class="text-xs text-green-600">
{{
((overviewStats.zhilian.used / overviewStats.zhilian.total) * 100).toFixed(1)
}}%
</span>
</template>
</n-statistic>
@ -196,8 +213,8 @@ const columns = [
</n-grid-item>
</n-grid>
<div class="bg-white p-4 rounded-lg shadow-sm">
<div class="flex justify-between items-center mb-4">
<div class="rounded-lg bg-white p-4 shadow-sm">
<div class="mb-4 flex items-center justify-between">
<NTabs v-model:value="activeTab" type="line" animated>
<NTabPane name="boss" tab="Boss直聘" />
<NTabPane name="qcwy" tab="前程无忧" />
@ -251,10 +268,18 @@ const columns = [
:label-width="80"
:model="modalForm"
>
<NFormItem label="城市" path="city" :rule="{ required: true, message: '请输入城市', trigger: ['input', 'blur'] }">
<NFormItem
label="城市"
path="city"
:rule="{ required: true, message: '请输入城市', trigger: ['input', 'blur'] }"
>
<NInput v-model:value="modalForm.city" clearable placeholder="请输入城市" />
</NFormItem>
<NFormItem label="职位" path="job" :rule="{ required: true, message: '请输入职位', trigger: ['input', 'blur'] }">
<NFormItem
label="职位"
path="job"
:rule="{ required: true, message: '请输入职位', trigger: ['input', 'blur'] }"
>
<NInput v-model:value="modalForm.job" clearable placeholder="请输入职位" />
</NFormItem>
</NForm>

View File

@ -1,146 +1,14 @@
<template>
<CommonPage title="Boss直聘数据">
<div class="h-full flex flex-col">
<CrudTable
ref="$table"
v-model:query-items="queryItems"
:columns="columns"
:get-data="getData"
>
<template #queryBar>
<n-form-item label="数据类型" label-placement="left">
<n-select
v-model:value="queryItems.data_type"
:options="dataTypeOptions"
style="width: 200px"
@update:value="$table?.handleSearch()"
/>
</n-form-item>
</template>
</CrudTable>
<n-modal v-model:show="showDetailModal" preset="card" title="数据详情" style="width: 800px">
<n-code :code="detailJson" language="json" word-wrap />
</n-modal>
</div>
</CommonPage>
<PlatformData platform="boss" title="Boss直聘数据" :job-columns="jobColumns" />
</template>
<script setup>
import { ref, h, onMounted, computed } from 'vue'
import { NButton, NTag, NFormItem, NSelect, NCode, NModal } from 'naive-ui'
import CommonPage from '@/components/page/CommonPage.vue'
import CrudTable from '@/components/table/CrudTable.vue'
import api from '@/api'
import PlatformData from '../components/PlatformData.vue'
const $table = ref(null)
const queryItems = ref({
data_type: 'job',
platform: 'boss'
})
const showDetailModal = ref(false)
const detailJson = ref('')
const dataTypeOptions = [
{ label: '职位', value: 'job' },
{ label: '公司', value: 'company' }
const jobColumns = [
{ title: 'ID', key: 'id', width: 100 },
{ title: '职位ID', key: 'job_id', width: 200 },
{ title: '创建时间', key: 'created_at', width: 200 },
{ title: '更新时间', key: 'updated_at', width: 200 },
]
const columns = computed(() => {
if (queryItems.value.data_type === 'job') {
return [
{
title: 'ID',
key: 'id',
width: 100
},
{
title: '职位ID',
key: 'job_id',
width: 200
},
{
title: '创建时间',
key: 'created_at',
width: 200
},
{
title: '更新时间',
key: 'updated_at',
width: 200
},
{
title: '操作',
key: 'actions',
width: 100,
align: 'center',
fixed: 'right',
render(row) {
return h(NButton, {
size: 'small',
onClick: () => {
// json_data row
const content = row.json_data ? JSON.parse(row.json_data) : row
detailJson.value = JSON.stringify(content, null, 2)
showDetailModal.value = true
}
}, { default: () => '详情' })
}
}
]
} else {
return [
{
title: 'ID',
key: 'id',
width: 100
},
{
title: '公司名称',
key: 'company_name',
width: 250
},
{
title: '创建时间',
key: 'created_at',
width: 200
},
{
title: '更新时间',
key: 'updated_at',
width: 200
},
{
title: '操作',
key: 'actions',
width: 100,
align: 'center',
fixed: 'right',
render(row) {
return h(NButton, {
size: 'small',
onClick: () => {
const content = row.json_data ? JSON.parse(row.json_data) : row
detailJson.value = JSON.stringify(content, null, 2)
showDetailModal.value = true
}
}, { default: () => '详情' })
}
}
]
}
})
const getData = async (params) => {
params.platform = 'boss'
const res = await api.queryPlatformData(params)
return {
data: res.data?.items || [],
total: res.data?.total || 0
}
}
onMounted(() => {
$table.value?.handleSearch()
})
</script>

View File

@ -0,0 +1,112 @@
<template>
<CommonPage :title="title">
<div class="h-full flex flex-col">
<CrudTable
ref="$table"
v-model:query-items="queryItems"
:columns="currentColumns"
:get-data="getData"
>
<template #queryBar>
<n-form-item label="数据类型" label-placement="left">
<n-select
v-model:value="queryItems.data_type"
:options="dataTypeOptions"
style="width: 200px"
@update:value="() => $table?.handleSearch()"
/>
</n-form-item>
</template>
</CrudTable>
<n-modal v-model:show="showDetailModal" preset="card" title="数据详情" style="width: 800px">
<n-code :code="detailJson" language="json" word-wrap />
</n-modal>
</div>
</CommonPage>
</template>
<script setup>
import { ref, h, onMounted, computed, defineProps } from 'vue'
import { NButton, NFormItem, NSelect, NCode, NModal } from 'naive-ui'
import CommonPage from '@/components/page/CommonPage.vue'
import CrudTable from '@/components/table/CrudTable.vue'
import api from '@/api'
// Inputs supplied by the per-platform wrapper pages.
const props = defineProps({
// Platform identifier; copied into the query items and into every request's params.
platform: {
type: String,
required: true,
},
// Page heading, bound to CommonPage's `title` in the template.
title: {
type: String,
required: true,
},
// Column definitions used when the selected data type is "job".
// The shared action column is appended to these by `currentColumns`.
jobColumns: {
type: Array,
required: true,
},
})
// Template ref to the CrudTable instance; used to trigger searches.
const $table = ref(null)
// Query state bound to the table via v-model:query-items. Starts on the "job" data type.
const queryItems = ref({
data_type: 'job',
platform: props.platform,
})
// Visibility flag and pretty-printed JSON text for the "数据详情" detail modal.
const showDetailModal = ref(false)
const detailJson = ref('')
// Choices for the data-type select in the query bar.
const dataTypeOptions = [
{ label: '职位', value: 'job' },
{ label: '公司', value: 'company' },
]
/**
 * Build the trailing "操作" (actions) column appended to every column set.
 *
 * Each row gets a small "详情" button. Clicking it serialises the row's
 * payload into `detailJson` (2-space indented JSON) and opens the detail modal.
 *
 * Payload selection:
 *   - `row.json_data` is a string  -> parsed with JSON.parse
 *   - `row.json_data` is any other truthy value -> shown as-is
 *   - `row.json_data` is falsy, or parsing fails -> the whole row is shown
 *
 * @returns {object} a column definition with a `render(row)` function.
 */
const renderActionColumn = () => ({
  title: '操作',
  key: 'actions',
  width: 100,
  align: 'center',
  fixed: 'right',
  render(row) {
    return h(
      NButton,
      {
        size: 'small',
        onClick: () => {
          const raw = row.json_data
          let content = row
          if (raw) {
            if (typeof raw === 'string') {
              try {
                content = JSON.parse(raw)
              } catch (e) {
                // Malformed JSON must not break the click handler: show the
                // raw row instead so the user still gets a detail view.
                content = row
              }
            } else {
              // Already deserialised; JSON.parse would coerce it to a string
              // such as "[object Object]" and throw.
              content = raw
            }
          }
          detailJson.value = JSON.stringify(content, null, 2)
          showDetailModal.value = true
        },
      },
      { default: () => '详情' }
    )
  },
})
// Columns for the currently selected data type. "job" uses the columns passed
// in by the parent page; any other data type uses the fixed company columns.
// The actions column is always appended last.
const currentColumns = computed(() => {
  const leadingColumns =
    queryItems.value.data_type === 'job'
      ? props.jobColumns
      : [
          { title: 'ID', key: 'id', width: 100 },
          { title: '公司名称', key: 'company_name', width: 250 },
          { title: '创建时间', key: 'created_at', width: 200 },
          { title: '更新时间', key: 'updated_at', width: 200 },
        ]
  return [...leadingColumns, renderActionColumn()]
})
/**
 * Data loader passed to <CrudTable :get-data>.
 *
 * Stamps the platform onto the incoming params object (in place), queries the
 * backend, and returns the rows and total count.
 *
 * @param {object} params query parameters supplied by the caller.
 * @returns {Promise<{data: Array, total: number}>} empty list / 0 when the
 *   response carries no `data.items` / `data.total`.
 */
const getData = async (params) => {
  params.platform = props.platform
  const response = await api.queryPlatformData(params)
  const payload = response.data
  const rows = payload?.items || []
  const count = payload?.total || 0
  return { data: rows, total: count }
}
// Run the first search as soon as the component is mounted, provided the
// table ref has been populated.
onMounted(() => {
  const table = $table.value
  if (table !== null && table !== undefined) {
    table.handleSearch()
  }
})
</script>

View File

@ -1,151 +1,15 @@
<template>
<CommonPage title="前程无忧数据">
<div class="h-full flex flex-col">
<CrudTable
ref="$table"
v-model:query-items="queryItems"
:columns="columns"
:get-data="getData"
>
<template #queryBar>
<n-form-item label="数据类型" label-placement="left">
<n-select
v-model:value="queryItems.data_type"
:options="dataTypeOptions"
style="width: 200px"
@update:value="$table?.handleSearch()"
/>
</n-form-item>
</template>
</CrudTable>
<n-modal v-model:show="showDetailModal" preset="card" title="数据详情" style="width: 800px">
<n-code :code="detailJson" language="json" word-wrap />
</n-modal>
</div>
</CommonPage>
<PlatformData platform="qcwy" title="前程无忧数据" :job-columns="jobColumns" />
</template>
<script setup>
import { ref, h, onMounted, computed } from 'vue'
import { NButton, NTag, NFormItem, NSelect, NCode, NModal } from 'naive-ui'
import CommonPage from '@/components/page/CommonPage.vue'
import CrudTable from '@/components/table/CrudTable.vue'
import api from '@/api'
import PlatformData from '../components/PlatformData.vue'
const $table = ref(null)
const queryItems = ref({
data_type: 'job',
platform: 'qcwy'
})
const showDetailModal = ref(false)
const detailJson = ref('')
const dataTypeOptions = [
{ label: '职位', value: 'job' },
{ label: '公司', value: 'company' }
const jobColumns = [
{ title: 'ID', key: 'id', width: 100 },
{ title: '职位ID', key: 'job_id', width: 200 },
{ title: '更新时间', key: 'update_date_time', width: 200 },
{ title: '创建时间', key: 'created_at', width: 200 },
{ title: '最后更新', key: 'updated_at', width: 200 },
]
const columns = computed(() => {
if (queryItems.value.data_type === 'job') {
return [
{
title: 'ID',
key: 'id',
width: 100
},
{
title: '职位ID',
key: 'job_id',
width: 200
},
{
title: '更新时间',
key: 'update_date_time',
width: 200
},
{
title: '创建时间',
key: 'created_at',
width: 200
},
{
title: '最后更新',
key: 'updated_at',
width: 200
},
{
title: '操作',
key: 'actions',
width: 100,
align: 'center',
fixed: 'right',
render(row) {
return h(NButton, {
size: 'small',
onClick: () => {
const content = row.json_data ? JSON.parse(row.json_data) : row
detailJson.value = JSON.stringify(content, null, 2)
showDetailModal.value = true
}
}, { default: () => '详情' })
}
}
]
} else {
return [
{
title: 'ID',
key: 'id',
width: 100
},
{
title: '公司名称',
key: 'company_name',
width: 250
},
{
title: '创建时间',
key: 'created_at',
width: 200
},
{
title: '更新时间',
key: 'updated_at',
width: 200
},
{
title: '操作',
key: 'actions',
width: 100,
align: 'center',
fixed: 'right',
render(row) {
return h(NButton, {
size: 'small',
onClick: () => {
const content = row.json_data ? JSON.parse(row.json_data) : row
detailJson.value = JSON.stringify(content, null, 2)
showDetailModal.value = true
}
}, { default: () => '详情' })
}
}
]
}
})
const getData = async (params) => {
// Ensure platform is set
params.platform = 'qcwy'
const res = await api.queryPlatformData(params)
return {
data: res.data?.items || [],
total: res.data?.total || 0
}
}
onMounted(() => {
$table.value?.handleSearch()
})
</script>

View File

@ -1,150 +1,15 @@
<template>
<CommonPage title="智联招聘数据">
<div class="h-full flex flex-col">
<CrudTable
ref="$table"
v-model:query-items="queryItems"
:columns="columns"
:get-data="getData"
>
<template #queryBar>
<n-form-item label="数据类型" label-placement="left">
<n-select
v-model:value="queryItems.data_type"
:options="dataTypeOptions"
style="width: 200px"
@update:value="$table?.handleSearch()"
/>
</n-form-item>
</template>
</CrudTable>
<n-modal v-model:show="showDetailModal" preset="card" title="数据详情" style="width: 800px">
<n-code :code="detailJson" language="json" word-wrap />
</n-modal>
</div>
</CommonPage>
<PlatformData platform="zhilian" title="智联招聘数据" :job-columns="jobColumns" />
</template>
<script setup>
import { ref, h, onMounted, computed } from 'vue'
import { NButton, NTag, NFormItem, NSelect, NCode, NModal } from 'naive-ui'
import CommonPage from '@/components/page/CommonPage.vue'
import CrudTable from '@/components/table/CrudTable.vue'
import api from '@/api'
import PlatformData from '../components/PlatformData.vue'
const $table = ref(null)
const queryItems = ref({
data_type: 'job',
platform: 'zhilian'
})
const showDetailModal = ref(false)
const detailJson = ref('')
const dataTypeOptions = [
{ label: '职位', value: 'job' },
{ label: '公司', value: 'company' }
const jobColumns = [
{ title: 'ID', key: 'id', width: 100 },
{ title: '职位编号', key: 'number', width: 200 },
{ title: '发布时间', key: 'first_publish_time', width: 200 },
{ title: '创建时间', key: 'created_at', width: 200 },
{ title: '更新时间', key: 'updated_at', width: 200 },
]
const columns = computed(() => {
if (queryItems.value.data_type === 'job') {
return [
{
title: 'ID',
key: 'id',
width: 100
},
{
title: '职位编号',
key: 'number',
width: 200
},
{
title: '发布时间',
key: 'first_publish_time',
width: 200
},
{
title: '创建时间',
key: 'created_at',
width: 200
},
{
title: '更新时间',
key: 'updated_at',
width: 200
},
{
title: '操作',
key: 'actions',
width: 100,
align: 'center',
fixed: 'right',
render(row) {
return h(NButton, {
size: 'small',
onClick: () => {
const content = row.json_data ? JSON.parse(row.json_data) : row
detailJson.value = JSON.stringify(content, null, 2)
showDetailModal.value = true
}
}, { default: () => '详情' })
}
}
]
} else {
return [
{
title: 'ID',
key: 'id',
width: 100
},
{
title: '公司名称',
key: 'company_name',
width: 250
},
{
title: '创建时间',
key: 'created_at',
width: 200
},
{
title: '更新时间',
key: 'updated_at',
width: 200
},
{
title: '操作',
key: 'actions',
width: 100,
align: 'center',
fixed: 'right',
render(row) {
return h(NButton, {
size: 'small',
onClick: () => {
const content = row.json_data ? JSON.parse(row.json_data) : row
detailJson.value = JSON.stringify(content, null, 2)
showDetailModal.value = true
}
}, { default: () => '详情' })
}
}
]
}
})
const getData = async (params) => {
params.platform = 'zhilian'
const res = await api.queryPlatformData(params)
return {
data: res.data?.items || [],
total: res.data?.total || 0
}
}
onMounted(() => {
$table.value?.handleSearch()
})
</script>

View File

@ -81,7 +81,7 @@ const methodOptions = [
function formatJSON(data) {
try {
return typeof data === 'string'
return typeof data === 'string'
? JSON.stringify(JSON.parse(data), null, 2)
: JSON.stringify(data, null, 2)
} catch (e) {

View File

@ -5,7 +5,7 @@
ref="tableRef"
:columns="columns"
:query-items="queryItems"
:getData="handleGetList"
:get-data="handleGetList"
>
<template #queryBar>
<QueryBarItem label="wt2">
@ -24,7 +24,12 @@
</footer>
</n-card>
<CrudModal :title="modalTitle" :visible="modalVisible" :loading="modalLoading" @save="() => handleSave(tableRef?.handleSearch)">
<CrudModal
:title="modalTitle"
:visible="modalVisible"
:loading="modalLoading"
@save="() => handleSave(tableRef?.handleSearch)"
>
<n-form ref="modalFormRef" :model="modalForm" label-placement="left" label-width="90">
<n-form-item label="wt2" path="wt2" :rule="{ required: true, message: '请输入 wt2' }">
<n-input v-model:value="modalForm.wt2" />
@ -41,7 +46,6 @@
</n-form>
</CrudModal>
</AppPage>
</template>
<script setup>
@ -57,7 +61,12 @@ const columns = [
{ title: 'ID', key: 'id', width: 80 },
{ title: 'wt2', key: 'wt2', minWidth: 160 },
{ title: 'mpt', key: 'mpt', minWidth: 160 },
{ title: '启用', key: 'is_active', width: 90, render: (row) => h('span', row.is_active ? '是' : '否') },
{
title: '启用',
key: 'is_active',
width: 90,
render: (row) => h('span', row.is_active ? '是' : '否'),
},
{ title: '失败次数', key: 'failed_count', width: 100 },
{ title: '最后使用时间', key: 'last_used_time', minWidth: 180 },
{ title: '创建时间', key: 'created_at', minWidth: 180 },
@ -67,8 +76,16 @@ const columns = [
width: 160,
render(row) {
return h('div', { style: 'display:flex;gap:8px' }, [
h(NButton, { type: 'primary', size: 'small', onClick: () => handleEdit(row) }, { default: () => '编辑' }),
h(NButton, { size: 'small', onClick: () => handleDelete(row.id) }, { default: () => '删除' }),
h(
NButton,
{ type: 'primary', size: 'small', onClick: () => handleEdit(row) },
{ default: () => '编辑' }
),
h(
NButton,
{ size: 'small', onClick: () => handleDelete(row.id) },
{ default: () => '删除' }
),
])
},
},
@ -80,17 +97,26 @@ async function handleGetList(params) {
}
const initForm = { id: null, wt2: '', mpt: '', is_active: true, failed_count: 0 }
const { modalVisible, modalTitle, modalLoading, handleAdd, handleDelete, handleEdit, handleSave, modalForm, modalFormRef } = useCRUD({
const {
modalVisible,
modalTitle,
modalLoading,
handleAdd,
handleDelete,
handleEdit,
handleSave,
modalForm,
modalFormRef,
} = useCRUD({
name: 'Boss Token',
initForm,
doCreate: (data) => tokenApi.create(data),
doUpdate: (data) => tokenApi.update(data.id, data),
doDelete: (id) => tokenApi.remove(id),
refresh: () => tableRef?.handleSearch(),
refresh: () => tableRef.value?.handleSearch(),
})
onMounted(() => {
tableRef.value?.handleSearch()
})
</script>

View File

@ -204,40 +204,41 @@ const columns = [
default: () => h('div', {}, '确定删除该用户吗?'),
}
),
!row.is_superuser && h(
NPopconfirm,
{
onPositiveClick: async () => {
try {
await api.resetPassword({ user_id: row.id });
$message.success('密码已成功重置为123456');
await $table.value?.handleSearch();
} catch (error) {
$message.error('重置密码失败: ' + error.message);
}
!row.is_superuser &&
h(
NPopconfirm,
{
onPositiveClick: async () => {
try {
await api.resetPassword({ user_id: row.id })
$message.success('密码已成功重置为123456')
await $table.value?.handleSearch()
} catch (error) {
$message.error('重置密码失败: ' + error.message)
}
},
onNegativeClick: () => {},
},
onNegativeClick: () => {},
},
{
trigger: () =>
withDirectives(
h(
NButton,
{
size: 'small',
type: 'warning',
style: 'margin-right: 8px;',
},
{
default: () => '重置密码',
icon: renderIcon('material-symbols:lock-reset', { size: 16 }),
}
{
trigger: () =>
withDirectives(
h(
NButton,
{
size: 'small',
type: 'warning',
style: 'margin-right: 8px;',
},
{
default: () => '重置密码',
icon: renderIcon('material-symbols:lock-reset', { size: 16 }),
}
),
[[vPermission, 'post/api/v1/user/reset_password']]
),
[[vPermission, 'post/api/v1/user/reset_password']]
),
default: () => h('div', {}, '确定重置用户密码为123456吗?'),
}
),
default: () => h('div', {}, '确定重置用户密码为123456吗?'),
}
),
]
},
},
@ -363,11 +364,11 @@ const validateAddUser = {
<h1>部门列表</h1>
<br />
<NTree
block-line
:data="deptOption"
key-field="id"
label-field="name"
default-expand-all
block-line
:node-props="nodeProps"
>
</NTree>

View File

@ -30,7 +30,7 @@ export default defineConfig(({ command, mode }) => {
open: true,
proxy: VITE_USE_PROXY
? {
[VITE_BASE_API]: PROXY_CONFIG[VITE_BASE_API]
[VITE_BASE_API]: PROXY_CONFIG[VITE_BASE_API],
}
: undefined,
},

Some files were not shown because too many files have changed in this diff Show More