全部完成
This commit is contained in:
parent
45fba5697e
commit
78eee99c2f
116
.claude/index.json
Normal file
116
.claude/index.json
Normal file
@ -0,0 +1,116 @@
|
||||
{
|
||||
"generated_at": "2026-03-20T00:00:00+08:00",
|
||||
"project": "JobData - 招聘数据采集与统计分析平台",
|
||||
"scan_coverage": {
|
||||
"estimated_total_files": 220,
|
||||
"scanned_files": 35,
|
||||
"coverage_percent": 16,
|
||||
"note": "node_modules 已排除;覆盖全部核心业务文件,node_modules 约占总文件数 90% 以上"
|
||||
},
|
||||
"ignored_directories": [
|
||||
{ "path": "web/node_modules", "reason": ".gitignore 规则" },
|
||||
{ "path": "__pycache__", "reason": ".gitignore 规则" },
|
||||
{ "path": ".venv / venv", "reason": ".gitignore 规则" },
|
||||
{ "path": "migrations", "reason": ".gitignore 规则" },
|
||||
{ "path": "clickhouse_data / data", "reason": ".gitignore 规则" }
|
||||
],
|
||||
"modules": [
|
||||
{
|
||||
"path": "app",
|
||||
"language": "Python 3.13",
|
||||
"framework": "FastAPI 0.111 + Tortoise-ORM 0.23",
|
||||
"entry": "app/__init__.py",
|
||||
"startup_script": "run.py",
|
||||
"config": "app/settings/config.py",
|
||||
"api_prefix": "/api/v1",
|
||||
"routes": [
|
||||
"/base", "/user", "/role", "/menu", "/api", "/dept", "/auditlog",
|
||||
"/job", "/universal", "/token", "/proxy", "/stats", "/pipeline",
|
||||
"/keyword", "/cleaning", "/analytics", "/company"
|
||||
],
|
||||
"databases": {
|
||||
"mysql": "Tortoise-ORM, 表: user/role/api/menu/dept/auditlog/boss_token/cleaning_*/metrics_*",
|
||||
"clickhouse": "直接连接, 表: boss_job/boss_company/qcwy_job/qcwy_company/zhilian_job/zhilian_company/pending_company, 视图: job_analytics"
|
||||
},
|
||||
"scheduler_tasks": [
|
||||
"stats_job (每6h)", "ecs_full_pipeline (每6h)",
|
||||
"ip_alert_job (每10min)", "company_cleaning_job (每5min)",
|
||||
"daily_cleanup_job (每天00:05)"
|
||||
],
|
||||
"test_exists": false,
|
||||
"gaps": ["缺少单元测试", "缺少集成测试", "config.py 中有硬编码凭据"]
|
||||
},
|
||||
{
|
||||
"path": "web",
|
||||
"language": "JavaScript (Vue 3)",
|
||||
"framework": "Vue 3.3 + Vite 4 + Naive UI + Pinia",
|
||||
"entry": "web/src/main.js",
|
||||
"build_cmd": "pnpm build",
|
||||
"dev_cmd": "pnpm dev",
|
||||
"key_views": [
|
||||
"views/analytics/index.vue",
|
||||
"views/cleaning/index.vue",
|
||||
"views/recruitment/boss|qcwy|zhilian/index.vue",
|
||||
"views/system/*"
|
||||
],
|
||||
"api_layer": "web/src/api/",
|
||||
"state_management": "Pinia (user, permission, app, tags)",
|
||||
"test_exists": false,
|
||||
"gaps": ["缺少 Vitest 单元测试", "缺少 Playwright E2E 测试"]
|
||||
},
|
||||
{
|
||||
"path": "jobs_spider",
|
||||
"language": "Python",
|
||||
"sub_modules": ["boss", "qcwy", "zhilian"],
|
||||
"entry": {
|
||||
"boss": "jobs_spider/boss/boos_api.py (死循环主入口)",
|
||||
"qcwy": "jobs_spider/qcwy/run_company_search.py",
|
||||
"zhilian": "jobs_spider/zhilian/company_spider.py"
|
||||
},
|
||||
"push_api": "POST /api/v1/universal/data/batch-store-async",
|
||||
"anti_crawl": ["SmartIPManager (代理池轮换)", "随机延迟>=10s", "Session 重建", "Cookie 更新"],
|
||||
"test_exists": false,
|
||||
"gaps": ["缺少 IPAnomalyDetector 单元测试", "缺少数据解析函数测试"]
|
||||
},
|
||||
{
|
||||
"path": "ecs_full_pipeline.py",
|
||||
"language": "Python",
|
||||
"description": "阿里云 ECS 实例批量创建/销毁/命令下发全流程脚本",
|
||||
"entry": "ecs_full_pipeline.py",
|
||||
"cloud": "Alibaba Cloud ECS (cn-qingdao-b, ecs.n1.tiny, 抢占实例)",
|
||||
"gaps": ["AK/SK 在代码中硬编码(安全风险)"]
|
||||
}
|
||||
],
|
||||
"security_issues": [
|
||||
{
|
||||
"severity": "HIGH",
|
||||
"file": "app/settings/config.py",
|
||||
"issue": "MySQL 连接串(含密码)、ClickHouse 密码、SMTP 密码硬编码"
|
||||
},
|
||||
{
|
||||
"severity": "HIGH",
|
||||
"file": "ecs_full_pipeline.py",
|
||||
"issue": "阿里云 AK/SK 硬编码在 main() 函数中"
|
||||
},
|
||||
{
|
||||
"severity": "MEDIUM",
|
||||
"file": "app/settings/config.py",
|
||||
"issue": "SECRET_KEY = 'CHANGE_ME_DEV_ONLY',生产环境需替换"
|
||||
}
|
||||
],
|
||||
"coverage_gaps": [
|
||||
"所有模块均无自动化测试(单元/集成/E2E)",
|
||||
"jobs_spider/qcwy 和 jobs_spider/zhilian 未深度扫描",
|
||||
"app/services/company_cleaner.py 未扫描",
|
||||
"web/src/views/system/* 子目录未逐一扫描"
|
||||
],
|
||||
"next_scan_recommendations": [
|
||||
"补扫: app/services/company_cleaner.py",
|
||||
"补扫: jobs_spider/qcwy/qcwy.py",
|
||||
"补扫: jobs_spider/zhilian/zhilian_single.py",
|
||||
"补扫: web/src/views/system/ 各子目录",
|
||||
"补扫: app/core/algorithms/ 反爬虫算法",
|
||||
"补扫: app/api/v1/cleaning/ 清洗接口实现"
|
||||
],
|
||||
"truncated": false
|
||||
}
|
||||
371
.claude/plan/crawl-state-management.md
Normal file
371
.claude/plan/crawl-state-management.md
Normal file
@ -0,0 +1,371 @@
|
||||
# 爬虫状态管理系统设计
|
||||
|
||||
## Context
|
||||
|
||||
当前爬虫系统存在两个核心缺陷:
|
||||
|
||||
1. **关键词消费不可恢复**:`get_available()` 通过 `last_requested_date=today` 标记关键词已用,一旦爬虫崩溃,该关键词当天不会再被分配,已爬取的页面数据白白浪费。
|
||||
2. **分页状态无持久化**:所有 3 个爬虫的分页逻辑都在内存中(Boss MAX_PAGES=3, QCWY MAX_PAGES=50, Zhilian MAX_PAGES=15),网络异常或进程重启后无法从断点恢复。
|
||||
|
||||
用户需求:关键词用完后标记不再重复请求 + 记录分页进度实现断点续爬。
|
||||
|
||||
---
|
||||
|
||||
## 架构决策
|
||||
|
||||
1. **扩展现有 keyword 表**(而非新建表):crawl 状态与 keyword 是 1:1 日粒度关系,新建表增加 JOIN 开销且无收益
|
||||
2. **状态机驱动**:`crawl_status` 字段控制关键词生命周期,替代简单的 date 比较
|
||||
3. **服务端记录进度**:爬虫每完成一页向服务端汇报,而非本地记录(支持多机分布式爬取)
|
||||
4. **过期检测**:`crawling` 状态超时自动降级为 `partial`,防止僵死
|
||||
|
||||
---
|
||||
|
||||
## 实施步骤
|
||||
|
||||
### Step 1: 扩展 Keyword 模型
|
||||
|
||||
**修改** `app/models/keyword.py`
|
||||
|
||||
在 `BaseKeyword` 中新增字段:
|
||||
|
||||
```python
|
||||
class BaseKeyword(Model):
|
||||
id = fields.IntField(pk=True)
|
||||
city = fields.CharField(max_length=64)
|
||||
job = fields.CharField(max_length=128)
|
||||
last_requested_date = fields.DateField(null=True)
|
||||
last_requested_at = fields.DatetimeField(null=True)
|
||||
|
||||
# --- 新增:爬取状态管理 ---
|
||||
crawl_status = fields.CharField(max_length=16, default="idle")
|
||||
# 状态值: idle / crawling / completed / failed / partial
|
||||
last_completed_page = fields.IntField(default=0) # 最后完成的页码
|
||||
total_pages = fields.IntField(default=0) # 发现的总页数(0=未知)
|
||||
jobs_found = fields.IntField(default=0) # 累计发现的职位数
|
||||
crawl_started_at = fields.DatetimeField(null=True) # 当次爬取开始时间
|
||||
crawler_id = fields.CharField(max_length=64, default="") # 爬虫实例标识
|
||||
error_message = fields.TextField(default="") # 最后错误信息
|
||||
retry_count = fields.IntField(default=0) # 当天重试次数
|
||||
|
||||
created_at = fields.DatetimeField(auto_now_add=True)
|
||||
updated_at = fields.DatetimeField(auto_now=True)
|
||||
|
||||
class Meta:
|
||||
abstract = True
|
||||
```
|
||||
|
||||
状态机流转:
|
||||
|
||||
```
|
||||
idle ──(get_available)──► crawling ──(all pages done)──► completed
|
||||
│
|
||||
├──(spider reports error)──► failed ──(retry<3)──► crawling
|
||||
│
|
||||
└──(timeout 30min)──► partial ──(get_available)──► crawling
|
||||
|
||||
次日 00:00 起:last_requested_date != today 的关键词一律按 idle 处理(不回写 crawl_status,下次被 get_available 认领时重置分页状态)
|
||||
```
|
||||
|
||||
### Step 2: 重写 get_available() 控制器
|
||||
|
||||
**修改** `app/controllers/keyword.py` 的 `get_available()` 方法
|
||||
|
||||
优先级调度逻辑(替代当前的简单 date 过滤):
|
||||
|
||||
```
|
||||
优先级 1: crawl_status='partial' AND last_requested_date=today (断点续爬)
|
||||
优先级 2: crawl_status='failed' AND retry_count<3 AND last_requested_date=today (失败重试)
|
||||
优先级 3: (last_requested_date!=today OR last_requested_date IS NULL) (全新关键词)
|
||||
```
|
||||
|
||||
返回值增加 `last_completed_page` 和 `crawl_status`,使爬虫知道从哪页开始:
|
||||
|
||||
```python
|
||||
items = [{
|
||||
"id": r.id,
|
||||
"city": r.city,
|
||||
"job": r.job,
|
||||
"last_completed_page": r.last_completed_page, # 新增
|
||||
"crawl_status": r.crawl_status, # 新增
|
||||
}]
|
||||
```
|
||||
|
||||
认领时原子更新:
|
||||
```python
|
||||
update_fields = {
|
||||
"last_requested_date": today,
|
||||
"last_requested_at": now,
|
||||
"crawl_status": "crawling",
|
||||
"crawl_started_at": now,
|
||||
"crawler_id": crawler_id, # 从请求参数获取
|
||||
}
|
||||
# 如果是全新关键词(非续爬),重置分页状态
|
||||
if is_fresh:
|
||||
update_fields["last_completed_page"] = 0
|
||||
update_fields["total_pages"] = 0
|
||||
update_fields["jobs_found"] = 0
|
||||
update_fields["error_message"] = ""
|
||||
update_fields["retry_count"] = 0
|
||||
```
|
||||
|
||||
### Step 3: 新增进度汇报 API
|
||||
|
||||
**修改** `app/api/v1/keyword/keyword.py` — 新增 2 个端点
|
||||
|
||||
#### 3.1 页面进度汇报
|
||||
|
||||
```
|
||||
POST /api/v1/keyword/page-progress
|
||||
Body: {
|
||||
"source": "boss",
|
||||
"keyword_id": 123,
|
||||
"page": 2,
|
||||
"total_pages": 10, // 可选,爬虫发现的总页数
|
||||
"jobs_found": 15 // 本页发现的职位数
|
||||
}
|
||||
```
|
||||
|
||||
控制器逻辑:
|
||||
```python
|
||||
async def report_page_progress(self, source, keyword_id, page, total_pages=None, jobs_found=0):
|
||||
model = self._ensure_model(source)
|
||||
update_data = {"last_completed_page": page}
|
||||
if total_pages is not None and total_pages > 0:
|
||||
update_data["total_pages"] = total_pages
|
||||
# jobs_found 累加
|
||||
await model.filter(id=keyword_id).update(
|
||||
jobs_found=F("jobs_found") + jobs_found,
|
||||
**update_data,  # 已含 last_completed_page;total_pages 仅在 >0 时写入
|
||||
)
|
||||
```
|
||||
|
||||
#### 3.2 爬取完成/失败汇报
|
||||
|
||||
```
|
||||
POST /api/v1/keyword/crawl-complete
|
||||
Body: {
|
||||
"source": "boss",
|
||||
"keyword_id": 123,
|
||||
"status": "completed" | "failed",
|
||||
"error_message": "optional error detail"
|
||||
}
|
||||
```
|
||||
|
||||
控制器逻辑:
|
||||
```python
|
||||
async def report_crawl_complete(self, source, keyword_id, status, error_message=""):
|
||||
model = self._ensure_model(source)
|
||||
update_data = {"crawl_status": status, "error_message": error_message}
|
||||
if status == "failed":
|
||||
# 使用 F 表达式原子递增 retry_count
|
||||
await model.filter(id=keyword_id).update(
|
||||
crawl_status="failed",
|
||||
error_message=error_message,
|
||||
retry_count=F("retry_count") + 1,  # 与状态更新在同一条 UPDATE 中原子递增
|
||||
)
|
||||
else:
|
||||
await model.filter(id=keyword_id).update(**update_data)
|
||||
```
|
||||
|
||||
### Step 4: 新增请求 Schema
|
||||
|
||||
**修改** `app/schemas/keyword.py`(或新建)
|
||||
|
||||
```python
|
||||
class PageProgressRequest(BaseModel):
|
||||
source: str
|
||||
keyword_id: int
|
||||
page: int
|
||||
total_pages: Optional[int] = None
|
||||
jobs_found: int = 0
|
||||
|
||||
class CrawlCompleteRequest(BaseModel):
|
||||
source: str
|
||||
keyword_id: int
|
||||
status: Literal["completed", "failed"]
|
||||
error_message: str = ""
|
||||
```
|
||||
|
||||
### Step 5: 过期爬取检测(定时任务)
|
||||
|
||||
**修改** `app/core/scheduler.py` — 新增 `stale_crawl_cleanup_job`
|
||||
|
||||
```python
|
||||
# 每 10 分钟检查一次
|
||||
async def stale_crawl_cleanup():
|
||||
"""将超过 30 分钟仍为 crawling 状态的关键词降级为 partial"""
|
||||
threshold = datetime.now() - timedelta(minutes=30)
|
||||
for model in [BossKeyword, QcwyKeyword, ZhilianKeyword]:
|
||||
count = await model.filter(
|
||||
crawl_status="crawling",
|
||||
crawl_started_at__lt=threshold,
|
||||
).update(crawl_status="partial")
|
||||
if count:
|
||||
logger.info(f"{model.__name__}: {count} 条僵死爬取任务已标记为 partial")
|
||||
```
|
||||
|
||||
### Step 6: 修改爬虫 — Boss 直聘
|
||||
|
||||
**修改** `jobs_spider/boss/boos_api.py`
|
||||
|
||||
#### 6.1 增强 fetch_service_params()
|
||||
|
||||
```python
|
||||
def fetch_service_params() -> Optional[Dict[str, Any]]:
|
||||
try:
|
||||
url = f"{API_BASE_URL}/api/v1/keyword/available"
|
||||
crawler_id = f"boss-{os.getpid()}-{os.getenv('HOSTNAME', 'local')}"
|
||||
r = requests.get(url, params={
|
||||
"source": "boss", "limit": 1, "reserve": True,
|
||||
"crawler_id": crawler_id, # 新增
|
||||
}, timeout=10)
|
||||
# ... 解析逻辑 ...
|
||||
item = items[0]
|
||||
# 不再需要 mark-used(get_available 已原子标记)
|
||||
return {
|
||||
"query": item["job"],
|
||||
"city": item["city"],
|
||||
"scene": 1,
|
||||
"page": item.get("last_completed_page", 0) + 1, # 断点续爬
|
||||
"keyword_id": item["id"], # 新增
|
||||
}
|
||||
except Exception:
|
||||
return None
|
||||
```
|
||||
|
||||
#### 6.2 主循环添加进度汇报
|
||||
|
||||
```python
|
||||
# 在 get_job_list_multi_pages 的每页完成后回调中:
|
||||
def on_page_complete(page_num, jobs_count, keyword_id):
|
||||
try:
|
||||
requests.post(f"{API_BASE_URL}/api/v1/keyword/page-progress", json={
|
||||
"source": "boss",
|
||||
"keyword_id": keyword_id,
|
||||
"page": page_num,
|
||||
"jobs_found": jobs_count,
|
||||
}, timeout=5)
|
||||
except Exception:
|
||||
pass # 汇报失败不影响主流程
|
||||
|
||||
# 全部完成后:
|
||||
def on_crawl_done(keyword_id, success, error_msg=""):
|
||||
try:
|
||||
requests.post(f"{API_BASE_URL}/api/v1/keyword/crawl-complete", json={
|
||||
"source": "boss",
|
||||
"keyword_id": keyword_id,
|
||||
"status": "completed" if success else "failed",
|
||||
"error_message": error_msg,
|
||||
}, timeout=5)
|
||||
except Exception:
|
||||
pass
|
||||
```
|
||||
|
||||
### Step 7: 修改爬虫 — 前程无忧
|
||||
|
||||
**修改** `jobs_spider/qcwy/qcwy.py`
|
||||
|
||||
同 Boss 结构:
|
||||
- `fetch_service_params()` 增加 `keyword_id` 和 `last_completed_page` 返回
|
||||
- `crawl_multiple_pages()` 的 `start_page` 从 `last_completed_page + 1` 开始
|
||||
- 每页完成后调用 `page-progress` API
|
||||
- 全部完成/失败后调用 `crawl-complete` API
|
||||
|
||||
### Step 8: 修改爬虫 — 智联招聘
|
||||
|
||||
**修改** `jobs_spider/zhilian/zhilian_single.py`
|
||||
|
||||
同 Boss 结构:
|
||||
- `fetch_service_params()` 增加 `keyword_id` 和 `last_completed_page` 返回
|
||||
- `crawl_pc()` 的起始页从 `last_completed_page + 1` 开始
|
||||
- 每页完成后调用 `page-progress` API
|
||||
- 全部完成/失败后调用 `crawl-complete` API
|
||||
|
||||
### Step 9: 统计接口增强
|
||||
|
||||
**修改** `app/controllers/keyword.py` 的 `get_stats()`
|
||||
|
||||
返回值增加爬取状态分布:
|
||||
|
||||
```python
|
||||
async def get_stats(self, source, on_date=None):
|
||||
# ... 现有逻辑 ...
|
||||
# 新增状态分布
|
||||
crawling = await model.filter(crawl_status="crawling", last_requested_date=d).count()
|
||||
completed = await model.filter(crawl_status="completed", last_requested_date=d).count()
|
||||
failed = await model.filter(crawl_status="failed", last_requested_date=d).count()
|
||||
partial = await model.filter(crawl_status="partial", last_requested_date=d).count()
|
||||
|
||||
return {
|
||||
"data": {
|
||||
"date": str(d), "total": total, "used": used, "unused": unused,
|
||||
"crawl_status": {
|
||||
"crawling": crawling,
|
||||
"completed": completed,
|
||||
"failed": failed,
|
||||
"partial": partial,
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Step 10: 数据库迁移
|
||||
|
||||
执行 Aerich 迁移以在 MySQL keyword 表中添加新字段:
|
||||
|
||||
```bash
|
||||
aerich migrate --name add_crawl_state_fields
|
||||
aerich upgrade
|
||||
```
|
||||
|
||||
或在 `init_app.py` 的自动迁移中由 Aerich 自动处理(`RUN_MIGRATIONS_ON_STARTUP=True`)。
|
||||
|
||||
---
|
||||
|
||||
## 关键文件清单
|
||||
|
||||
| 文件 | 操作 | 说明 |
|
||||
|------|------|------|
|
||||
| `app/models/keyword.py` | 修改 | 添加 8 个爬取状态字段 |
|
||||
| `app/controllers/keyword.py` | 修改 | 重写 get_available() 优先级调度 + 新增 2 个方法 |
|
||||
| `app/api/v1/keyword/keyword.py` | 修改 | 新增 page-progress / crawl-complete 端点 |
|
||||
| `app/schemas/keyword.py` | 新建 | PageProgressRequest / CrawlCompleteRequest |
|
||||
| `app/core/scheduler.py` | 修改 | 新增 stale_crawl_cleanup 定时任务 |
|
||||
| `jobs_spider/boss/boos_api.py` | 修改 | 断点续爬 + 进度汇报 |
|
||||
| `jobs_spider/qcwy/qcwy.py` | 修改 | 断点续爬 + 进度汇报 |
|
||||
| `jobs_spider/zhilian/zhilian_single.py` | 修改 | 断点续爬 + 进度汇报 |
|
||||
|
||||
---
|
||||
|
||||
## 效果
|
||||
|
||||
| 指标 | 改造前 | 改造后 |
|
||||
|------|--------|--------|
|
||||
| 崩溃恢复 | 关键词丢失,当天不可恢复 | 自动从断点续爬 |
|
||||
| 页面重复爬取 | 100%(整个关键词重爬) | 0%(精确到页级别) |
|
||||
| 僵死任务检测 | 无 | 30 分钟自动降级 |
|
||||
| 失败重试 | 无(关键词当天报废) | 最多 3 次自动重试 |
|
||||
| 爬取进度可见性 | 无 | 实时可查(stats API) |
|
||||
|
||||
## 风险与缓解
|
||||
|
||||
| 风险 | 缓解措施 |
|
||||
|------|----------|
|
||||
| 字段迁移影响现有数据 | 所有新字段都有 default 值,迁移无破坏性 |
|
||||
| 进度汇报增加 API 压力 | 汇报请求轻量(仅 UPDATE 单行),每页仅 1 次,可设 timeout=5s |
|
||||
| 爬虫不升级导致状态不一致 | 新旧爬虫可共存:旧爬虫不汇报进度,关键词仍按 date 逻辑工作 |
|
||||
| retry_count 无上限 | 硬限 3 次,超过 3 次的 failed 不再自动重试 |
|
||||
|
||||
## 验证方式
|
||||
|
||||
1. 启动应用,确认 Aerich 自动迁移新增字段成功
|
||||
2. 手动调用 `GET /api/v1/keyword/available?source=boss` 验证返回 `last_completed_page` 和 `crawl_status`
|
||||
3. 模拟断点:手动设置某关键词 `crawl_status=partial, last_completed_page=2`,再次 `get_available` 应优先返回该关键词
|
||||
4. 启动 Boss 爬虫,观察日志确认从 `last_completed_page + 1` 开始
|
||||
5. 强制 kill 爬虫,等待 30 分钟后确认 `stale_crawl_cleanup` 将状态降级为 `partial`
|
||||
6. 重启爬虫,确认自动续爬
|
||||
317
.claude/plan/project-optimization.md
Normal file
317
.claude/plan/project-optimization.md
Normal file
@ -0,0 +1,317 @@
|
||||
# 📋 实施计划:项目功能修复与优化
|
||||
|
||||
> 生成时间:2026-03-20
|
||||
> 工作目录:/Users/win/2025/AICoding/JobData
|
||||
|
||||
---
|
||||
|
||||
## 一、问题全景(为什么很多功能用不了)
|
||||
|
||||
经过深度代码审查,共发现 **22 个问题**,其中多个问题会直接导致功能完全不可用。以下按功能模块分组。
|
||||
|
||||
---
|
||||
|
||||
### 🔴 数据清洗功能(6 个问题 → 功能基本不可用)
|
||||
|
||||
| # | 严重度 | 文件 | 行号 | 问题 | 影响 |
|
||||
|---|--------|------|------|------|------|
|
||||
| C1 | **严重** | `services/cleaning.py` | 13 | `from jobs_spider.qcwy.search_company_jobs import _extract_items` 引入私有函数,若该文件/函数不存在则整个模块 `ImportError`,**所有清洗 API 直接 500** | 全部清洗功能不可用 |
|
||||
| C2 | **严重** | `services/cleaning.py` | 多处 | 所有爬虫调用(`boss_service.get_job_detail_by_id` 等)是同步 HTTP 阻塞调用,在 `async def` 中直接执行,**阻塞整个事件循环** | 高并发时应用无响应 |
|
||||
| C3 | **高** | `services/cleaning.py` | 28-36 | Boss Token 加载后 `_boss_token_loaded = True` 永不刷新,Token 过期后 Boss 清洗**静默失败** | Boss 平台清洗失效 |
|
||||
| C4 | **高** | `api/v1/cleaning/cleaning.py` | 285-308 | `process_task` API 无超时保护,爬虫卡住则 HTTP 连接永久挂起 | 客户端超时 |
|
||||
| C5 | **高** | **前端缺失** | — | 后端菜单注册了 `/cleaning/index` 和 `/cleaning/monitor`,但 `web/src/views/` 下**不存在对应组件文件** | 点菜单白屏/404 |
|
||||
| C6 | **中** | `api/v1/cleaning/cleaning.py` | 71-75 | `source`/`status` 直接拼入 ClickHouse SQL,SQL 注入风险 | 安全漏洞 |
|
||||
|
||||
---
|
||||
|
||||
### 🔴 定时任务功能(6 个问题 → 任务可能永久跳过)
|
||||
|
||||
| # | 严重度 | 文件 | 行号 | 问题 | 影响 |
|
||||
|---|--------|------|------|------|------|
|
||||
| S1 | **严重** | `core/locks.py` | 43 | 文件锁用 `os.mkdir` 实现,**无 TTL 过期机制**,Worker 崩溃后锁目录永久残留,该任务**永久跳过** | 任务永久失效 |
|
||||
| S2 | **严重** | `core/locks.py` | 38 | 异步函数中使用同步 `redis.Redis`,**阻塞事件循环** | 全局性能问题 |
|
||||
| S3 | **高** | `core/init_app.py` | — | 启动锁 `.startup_lock` 同样无 TTL,崩溃后**迁移和种子数据初始化永不再执行** | 启动异常 |
|
||||
| S4 | **高** | `core/locks.py` | 17 | 锁文件路径为**相对路径** `.lock_xxx`,多 Worker 以不同 CWD 启动时锁完全失效 | 任务并发执行 |
|
||||
| S5 | **中** | `core/scheduler.py` | — | `stats_job` 与 `ecs_full_pipeline_job` 调度时间完全重合(`*/6h`),同时执行压力大 | 资源竞争 |
|
||||
| S6 | **中** | `core/scheduler.py` | 181 | `company_cleaning_job` 处理 30 个公司可能超过 5 分钟调度间隔,任务堆积被 skip | 清洗停滞 |
|
||||
|
||||
---
|
||||
|
||||
### 🔴 安全问题(4 个 → 凭据泄漏)
|
||||
|
||||
| # | 严重度 | 文件 | 行号 | 问题 |
|
||||
|---|--------|------|------|------|
|
||||
| X1 | **严重** | `ecs_full_pipeline.py` | 487-488 | 阿里云 AK/SK 硬编码在代码中,已在 git 历史里 |
|
||||
| X2 | **严重** | `settings/config.py` | 44-52 | MySQL root 密码、SMTP 授权码、ClickHouse 密码硬编码 |
|
||||
| X3 | **严重** | `services/job.py` | 535 | 第三方 API 签名 salt 硬编码 |
|
||||
| X4 | **严重** | `core/dependency.py` | 26-28 | `token == "dev"` 开发后门在生产环境同样有效 |
|
||||
|
||||
---
|
||||
|
||||
### 🟡 IP 告警功能(3 个问题)
|
||||
|
||||
| # | 严重度 | 文件 | 行号 | 问题 |
|
||||
|---|--------|------|------|------|
|
||||
| I1 | **中** | `core/scheduler.py` | 273 | 邮件模板用 `a.get('date')` 但实际字段是 `last_report_at`,告警日期列**永远为空** |
|
||||
| I2 | **中** | `core/ip_tracking.py` | — | 中间件读 `response.body` 对流式响应无效,IP 计数不准 |
|
||||
| I3 | **低** | `core/ip_tracking.py` | 73 | `save()` 未指定 `update_fields`,并发写存在竞态 |
|
||||
|
||||
---
|
||||
|
||||
### 🟡 分析功能(2 个问题)
|
||||
|
||||
| # | 严重度 | 文件 | 行号 | 问题 |
|
||||
|---|--------|------|------|------|
|
||||
| A1 | **高** | `api/v1/analytics.py` | — | `backports.zoneinfo` 未在 Pipfile 中声明,若 Python 3.8 则 `ImportError`,整个分析路由挂 |
|
||||
| A2 | **低** | `api/v1/analytics.py` | — | `Query(regex=...)` 在 Pydantic v2 已弃用,应改 `pattern` |
|
||||
|
||||
---
|
||||
|
||||
### 🟡 Ruff 报告的代码缺陷(之前已诊断,此处不重复)
|
||||
|
||||
共 34 个 lint 错误,其中 3 个 F821(未定义变量 `udt`/`fpt`/`json`)会导致运行时崩溃。
|
||||
|
||||
---
|
||||
|
||||
## 二、实施步骤(按优先级排序)
|
||||
|
||||
### Phase 1:修复致命问题(功能完全不可用)
|
||||
|
||||
#### 1.1 修复文件锁 — 添加 TTL 过期机制
|
||||
|
||||
**文件**:`app/core/locks.py`
|
||||
|
||||
```python
|
||||
# 修改 _try_file_lock 方法
|
||||
# 在 acquire 时写入时间戳到锁目录内的文件
|
||||
# 在 acquire 失败时检查时间戳,若超过 TTL 则强制删除旧锁
|
||||
|
||||
async def acquire(self) -> bool:
|
||||
# Redis 路径不变
|
||||
if self._redis:
|
||||
return bool(self._redis.set(self._key, "locked", nx=True, ex=self.ttl))
|
||||
# 文件锁路径:改用绝对路径 + TTL 检查
|
||||
lock_dir = Path(tempfile.gettempdir()) / f"jobdata_lock_{self.name}"
|
||||
lock_meta = lock_dir / "meta"
|
||||
try:
|
||||
lock_dir.mkdir()
|
||||
lock_meta.write_text(str(time.time()))
|
||||
return True
|
||||
except FileExistsError:
|
||||
# 检查是否过期(TODO:若进程在 mkdir 之后、写入 meta 之前崩溃,meta 不存在,此分支不会清理该锁;需以目录 mtime 兜底)
|
||||
if lock_meta.exists():
|
||||
created = float(lock_meta.read_text())
|
||||
if time.time() - created > self.ttl:
|
||||
shutil.rmtree(lock_dir) # 强制清理过期锁
|
||||
return await self.acquire() # 重试
|
||||
return False
|
||||
```
|
||||
|
||||
#### 1.2 修复 Redis 同步阻塞 → 异步
|
||||
|
||||
**文件**:`app/core/locks.py`
|
||||
|
||||
将 `redis.Redis` 替换为 `redis.asyncio.Redis`,所有 `self._redis.set/get/delete` 改为 `await self._redis.set/get/delete`。
|
||||
|
||||
#### 1.3 修复清洗模块 ImportError 风险
|
||||
|
||||
**文件**:`app/services/cleaning.py:13`
|
||||
|
||||
```python
|
||||
# 修改前
|
||||
from jobs_spider.qcwy.search_company_jobs import _extract_items as qcwy_extract_items
|
||||
|
||||
# 修改后:安全导入 + 降级
|
||||
try:
|
||||
from jobs_spider.qcwy.search_company_jobs import _extract_items as qcwy_extract_items
|
||||
except ImportError:
|
||||
logger.warning("qcwy search_company_jobs 模块不可用,公司职位提取功能降级")
|
||||
qcwy_extract_items = None
|
||||
```
|
||||
|
||||
#### 1.4 修复清洗中同步阻塞调用
|
||||
|
||||
**文件**:`app/services/cleaning.py` 多处
|
||||
|
||||
将所有同步爬虫调用包装为 `asyncio.to_thread`:
|
||||
|
||||
```python
|
||||
# 修改前
|
||||
data = self.boss_service.get_job_detail_by_id(target)
|
||||
|
||||
# 修改后
|
||||
data = await asyncio.to_thread(self.boss_service.get_job_detail_by_id, target)
|
||||
```
|
||||
|
||||
涉及的方法:`clean_by_job_id`、`clean_by_company_name`、`clean_boss_company_jobs`、`clean_qcwy_company_jobs`、`clean_zhilian_company_jobs`(共约 12 处调用)。
|
||||
|
||||
`app/services/company_cleaner.py` 中同样的模式也需要修复(同步爬虫调用包装为 `to_thread`)。
|
||||
|
||||
#### 1.5 修复 Boss Token 永久缓存问题
|
||||
|
||||
**文件**:`app/services/cleaning.py:28-36`,`app/services/company_cleaner.py:28-36`
|
||||
|
||||
```python
|
||||
# 修改前
|
||||
async def _ensure_boss_token_loaded(self) -> None:
|
||||
if self._boss_token_loaded and self.boss_service.login_data.get("mpt"):
|
||||
return # 永不刷新
|
||||
|
||||
# 修改后:添加过期时间检查
|
||||
async def _ensure_boss_token_loaded(self) -> None:
|
||||
now = time.time()
|
||||
if (self._boss_token_loaded
|
||||
and self.boss_service.login_data.get("mpt")
|
||||
and now - self._token_loaded_at < 3600): # 1小时刷新一次
|
||||
return
|
||||
token_obj = await BossToken.filter(is_active=True).order_by("-updated_at").first()
|
||||
if token_obj:
|
||||
self.boss_service.login_data["mpt"] = token_obj.mpt_value
|
||||
self._boss_token_loaded = True
|
||||
self._token_loaded_at = now
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Phase 2:修复安全问题
|
||||
|
||||
#### 2.1 凭据迁移(config.py)
|
||||
|
||||
创建 `.env.example` + 修改 `config.py` 用 `pydantic-settings` 从环境变量读取(详见之前的 ruff-optimization 计划)。
|
||||
|
||||
#### 2.2 移除 dev 后门
|
||||
|
||||
**文件**:`app/core/dependency.py:26-28`
|
||||
|
||||
```python
|
||||
# 修改前
|
||||
if token == "dev":
|
||||
user = await User.filter().first()
|
||||
return user
|
||||
|
||||
# 修改后:仅在开发环境允许
|
||||
import os
|
||||
if token == "dev" and os.getenv("APP_ENV", "production") == "development":
|
||||
user = await User.filter().first()
|
||||
return user
|
||||
```
|
||||
|
||||
#### 2.3 阿里云 AK/SK 移入环境变量
|
||||
|
||||
**文件**:`ecs_full_pipeline.py:487-488`
|
||||
|
||||
```python
|
||||
# 修改前
|
||||
ak = "LTAI****"  # 已脱敏:计划文档中不保留真实密钥
|
||||
sk = "****"  # 已脱敏
|
||||
|
||||
# 修改后(原 AK/SK 已进入 git 历史,须先在阿里云控制台禁用并轮换,再改为从环境变量读取)
|
||||
ak = os.environ["ALIBABA_CLOUD_ACCESS_KEY_ID"]
|
||||
sk = os.environ["ALIBABA_CLOUD_ACCESS_KEY_SECRET"]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Phase 3:修复 IP 告警和分析功能
|
||||
|
||||
#### 3.1 修复邮件模板字段名
|
||||
|
||||
**文件**:`app/core/scheduler.py:273`
|
||||
|
||||
```python
|
||||
# 修改前
|
||||
f"<td>{a.get('date')}</td>"
|
||||
|
||||
# 修改后
|
||||
f"<td>{a.get('last_report_at', 'N/A')}</td>"
|
||||
```
|
||||
|
||||
#### 3.2 修复 analytics Query 参数弃用警告
|
||||
|
||||
**文件**:`app/api/v1/analytics.py`
|
||||
|
||||
```python
|
||||
# 修改前
|
||||
interval: str = Query("day", regex="^(day|hour|week|month)$")
|
||||
|
||||
# 修改后
|
||||
interval: str = Query("day", pattern="^(day|hour|week|month)$")
|
||||
```
|
||||
|
||||
#### 3.3 修复 zoneinfo 导入
|
||||
|
||||
确认 Python 版本为 3.13(项目 Pipfile 声明),`zoneinfo` 是标准库,无需 `backports`。可直接删除 try/except,只保留 `from zoneinfo import ZoneInfo`。
|
||||
|
||||
---
|
||||
|
||||
### Phase 4:修复 Ruff 34 个 lint 错误
|
||||
|
||||
```bash
|
||||
# 自动修复 22 个
|
||||
pipenv run ruff check app/ --fix
|
||||
|
||||
# 手动修复剩余 12 个(F821 × 3、E722 × 1、E402 × 5、其他 × 3)
|
||||
```
|
||||
|
||||
F821 重点修复:
|
||||
- `job.py:348` — `udt` 未定义(需确认应为 `update_date_time`)
|
||||
- `job.py:374` — `fpt` 未定义(需确认应为 `first_publish_time`)
|
||||
- `crawler/zhilian.py:60` — 添加 `import json`
|
||||
|
||||
---
|
||||
|
||||
### Phase 5:代码去重和可维护性优化
|
||||
|
||||
1. 合并 `job.py` 中 7 个重复的 `_check_*_duplicate` 为 1 个通用方法
|
||||
2. 删除死代码 `_check_qcwy_company_duplicate_by_name`
|
||||
3. 将 `job.py` 中 `requests.post` 替换为 `httpx.AsyncClient`
|
||||
4. 错开调度时间:将 `ecs_full_pipeline_job` 偏移 30 分钟,避免与 `stats_job` 重合
|
||||
|
||||
---
|
||||
|
||||
## 三、关键文件索引
|
||||
|
||||
| 文件 | 操作 | Phase | 说明 |
|
||||
|------|------|-------|------|
|
||||
| `app/core/locks.py` | 重构 | 1 | 文件锁 TTL + Redis 异步化 |
|
||||
| `app/services/cleaning.py` | 修复 | 1 | ImportError 防护 + async 阻塞 + Token 刷新 |
|
||||
| `app/services/company_cleaner.py` | 修复 | 1 | async 阻塞 + Token 刷新 |
|
||||
| `app/core/dependency.py` | 修复 | 2 | dev 后门加环境判断 |
|
||||
| `app/settings/config.py` | 重构 | 2 | 凭据移入环境变量 |
|
||||
| `ecs_full_pipeline.py` | 修复 | 2 | AK/SK 移入环境变量 |
|
||||
| `app/core/scheduler.py` | 修复 | 3 | 邮件字段名 + 调度时间偏移 |
|
||||
| `app/api/v1/analytics.py` | 修复 | 3 | regex→pattern + zoneinfo |
|
||||
| `app/services/job.py` | 修复 | 4+5 | F821 + E722 + requests→httpx + 去重方法合并 |
|
||||
| `app/services/crawler/zhilian.py` | 修复 | 4 | 添加 import json |
|
||||
| `web/src/views/cleaning/` | 新建 | 5 | 创建前端清洗页面组件(可选) |
|
||||
| `.env.example` | 新建 | 2 | 环境变量模板 |
|
||||
|
||||
---
|
||||
|
||||
## 四、风险与缓解
|
||||
|
||||
| 风险 | 缓解措施 |
|
||||
|------|----------|
|
||||
| 文件锁改造后旧锁目录残留 | 部署时手动清理 `.lock_*` 目录 |
|
||||
| Redis 异步化后连接池配置不同 | 保持相同连接参数,仅换客户端类 |
|
||||
| `asyncio.to_thread` 增加线程池压力 | `to_thread` 使用事件循环的默认执行器,启动时通过 `loop.set_default_executor(ThreadPoolExecutor(max_workers=10))` 限制并发 |
|
||||
| 凭据迁移后服务启动失败 | 先创建 `.env` 文件再部署 |
|
||||
| 前端清洗页面组件工作量大 | 可先做最小 MVP(列表 + 手动触发) |
|
||||
|
||||
---
|
||||
|
||||
## 五、执行顺序
|
||||
|
||||
```
|
||||
Phase 1(2h) → 修复致命问题:锁机制 + 清洗模块 + async 阻塞
|
||||
Phase 2(1h) → 安全问题:凭据迁移 + dev 后门
|
||||
Phase 3(30m) → IP 告警 + 分析功能修复
|
||||
Phase 4(30m) → Ruff 34 个 lint 错误
|
||||
Phase 5(2h) → 代码去重 + 前端组件(可选)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## SESSION_ID(供 /ccg:execute 使用)
|
||||
- CODEX_SESSION: N/A(本次分析由 Claude 本地执行)
|
||||
- GEMINI_SESSION: N/A
|
||||
213
.claude/plan/ruff-optimization.md
Normal file
213
.claude/plan/ruff-optimization.md
Normal file
@ -0,0 +1,213 @@
|
||||
# 📋 实施计划:Ruff 代码优化 + 项目质量提升
|
||||
|
||||
> 生成时间:2026-03-20
|
||||
> 工作目录:/Users/win/2025/AICoding/JobData
|
||||
|
||||
---
|
||||
|
||||
## 一、现状问题总览
|
||||
|
||||
### 1.1 Ruff 实际扫描结果(34 个错误,22 个可自动修复)
|
||||
|
||||
| 规则 | 数量 | 说明 | 可自动修复 |
|
||||
|------|------|------|:---:|
|
||||
| `F401` | 20 | 未使用的 import | ✅ |
|
||||
| `E402` | 5 | import 不在文件顶部 | ❌ |
|
||||
| `F541` | 3 | f-string 没有占位符 | ✅ |
|
||||
| `F821` | 3 | 引用了未定义的变量名 | ❌ |
|
||||
| `F811` | 2 | 重复定义(导入后又被覆盖) | ✅ |
|
||||
| `E722` | 1 | 裸 `except:`(不捕获具体异常) | ❌ |
|
||||
| **合计** | **34** | | **22 可自动修复** |
|
||||
|
||||
### 1.2 受影响文件清单
|
||||
|
||||
| 文件 | 问题数 | 最严重问题 |
|
||||
|------|--------|-----------|
|
||||
| `app/api/v1/token/token.py` | 8 | E402 + F811(import 顺序混乱,重复定义) |
|
||||
| `app/services/job.py` | 3 | **F821 未定义变量** `udt`、`fpt`;**E722 裸 except** |
|
||||
| `app/services/crawler/zhilian.py` | 1 | **F821 未定义变量** `json` |
|
||||
| `app/services/company_cleaner.py` | 3 | F541 空 f-string |
|
||||
| `app/services/crawler/__init__.py` | 3 | F401 无效的服务导出 |
|
||||
| `app/repositories/clickhouse_repo.py` | 2 | F401 `math`, `Generator` |
|
||||
| `app/schemas/token.py` | 2 | F401 `Dict`, `Any` |
|
||||
| `app/controllers/job.py` | 1 | F401 `Optional` |
|
||||
| `app/controllers/keyword.py` | 1 | F401 `CRUDBase` |
|
||||
| `app/core/algorithms/antispider.py` | 1 | F401 `os` |
|
||||
| `app/core/ip_tracking.py` | 1 | F401 `Any` |
|
||||
| `app/core/locks.py` | 1 | F401 `time` |
|
||||
| `app/api/v1/analytics.py` | 1 | F401 `List` |
|
||||
| `app/api/v1/ingest/ingest.py` | 1 | F401 `Optional` |
|
||||
| `app/schemas/analytics.py` | 1 | F401 `Any` |
|
||||
| `app/services/crawler/boss.py` | 1 | F401 `os` |
|
||||
|
||||
---
|
||||
|
||||
### 1.3 Ruff 扫描之外的深层问题(代码审查发现)
|
||||
|
||||
#### 🔴 CRITICAL — 安全问题(硬编码凭据)
|
||||
|
||||
| 文件 | 行号 | 问题 |
|
||||
|------|------|------|
|
||||
| `app/settings/config.py` | ~23 | `SECRET_KEY = "CHANGE_ME_DEV_ONLY"` JWT 密钥 |
|
||||
| `app/settings/config.py` | ~27-30 | ClickHouse 主机 IP、用户名、密码明文 |
|
||||
| `app/settings/config.py` | ~44-45 | SMTP 真实邮箱账号 + 授权码明文 |
|
||||
| `app/settings/config.py` | ~52 | MySQL root 密码 + 生产 IP 硬编码在连接串 |
|
||||
| `app/services/job.py` | ~533-535 | 外部 API salt 硬编码 |
|
||||
|
||||
#### 🔴 HIGH — 性能问题(事件循环阻塞)
|
||||
|
||||
| 文件 | 行号 | 问题 |
|
||||
|------|------|------|
|
||||
| `app/services/job.py` | ~547 | `async def` 中调用同步 `requests.post` 阻塞事件循环 |
|
||||
| `app/services/job.py` | ~926-933 | 串行逐条发送远程推送,N 条数据 = N 次串行阻塞 |
|
||||
| `app/core/locks.py` | ~38 | 同步 `redis.Redis` 在 `async` 方法中调用,阻塞事件循环 |
|
||||
|
||||
#### 🟡 MEDIUM — 代码质量
|
||||
|
||||
| 文件 | 问题 |
|
||||
|------|------|
|
||||
| `app/services/job.py` | 7 个 `_check_*_duplicate` 方法几乎完全重复,仅 SQL 参数不同 |
|
||||
| `app/services/job.py` | 1 个死代码方法:`_check_qcwy_company_duplicate_by_name` 从未被调用 |
|
||||
| `app/repositories/clickhouse_repo.py` | `group_by_column` 直接拼入 SQL(潜在 SQL 注入) |
|
||||
| `app/api/v1/__init__.py` | 同一 router 注册两次(`/job` 和 `/universal`),OpenAPI 文档重复 |
|
||||
| 全项目 | 零测试文件,关键业务逻辑(去重、路由分发)无任何测试保护 |
|
||||
|
||||
---
|
||||
|
||||
## 二、实施步骤
|
||||
|
||||
### Phase 1:Ruff 自动修复(低风险,5 分钟)
|
||||
|
||||
```bash
|
||||
# 自动修复 22 个可自动修复的问题
|
||||
pipenv run ruff check app/ --fix
|
||||
|
||||
# 验证修复结果
|
||||
pipenv run ruff check app/ --statistics
|
||||
```
|
||||
|
||||
自动修复覆盖:F401(未使用 import)、F541(空 f-string)、F811(重复定义)
|
||||
|
||||
### Phase 2:手动修复 Ruff 报告的无法自动修复问题(12 个)
|
||||
|
||||
#### 2.1 F821 未定义变量(CRITICAL,会导致运行时崩溃)
|
||||
|
||||
**`app/services/job.py:348`** — 变量 `udt` 未定义
|
||||
|
||||
需要读取上下文,确认 `udt` 应该是什么(可能是 `update_date_time` 的缩写或某个局部变量)。
|
||||
|
||||
**`app/services/job.py:374`** — 变量 `fpt` 未定义
|
||||
|
||||
需要读取上下文,确认 `fpt` 应该是什么(可能是 `first_publish_time` 缩写)。
|
||||
|
||||
**`app/services/crawler/zhilian.py:60`** — `json` 模块未导入但被使用
|
||||
|
||||
修复:在文件顶部添加 `import json`。
|
||||
|
||||
#### 2.2 E722 裸 except(`app/services/job.py:302`)
|
||||
|
||||
```python
|
||||
# 修改前
|
||||
except:
|
||||
pass
|
||||
|
||||
# 修改后
|
||||
except Exception as e:
|
||||
logger.error(f"处理失败: {e}")
|
||||
```
|
||||
|
||||
#### 2.3 E402 import 不在顶部(`app/api/v1/token/token.py:92-96`)
|
||||
|
||||
将条件式 import 移至文件顶部,或使用 `TYPE_CHECKING` 保护块。
|
||||
|
||||
### Phase 3:凭据安全(CRITICAL,建议本次一并完成)
|
||||
|
||||
**目标**:将所有硬编码凭据移入环境变量
|
||||
|
||||
1. 在项目根目录创建 `.env.example`(安全模板)
|
||||
2. 修改 `app/settings/config.py`,用 `pydantic-settings` 从环境变量读取所有敏感值
|
||||
3. 启动时校验必填环境变量,缺失则报错退出(Fail Fast)
|
||||
4. 将 `.env` 加入 `.gitignore`(已有则确认)
|
||||
|
||||
```python
|
||||
# config.py 改造后示例
|
||||
from pydantic_settings import BaseSettings
|
||||
|
||||
class Settings(BaseSettings):
|
||||
SECRET_KEY: str # 必填,无默认值
|
||||
CLICKHOUSE_HOST: str = "localhost"
|
||||
CLICKHOUSE_USER: str = "default"
|
||||
CLICKHOUSE_PASS: str # 必填
|
||||
SMTP_USER: str = ""
|
||||
SMTP_PASS: str = ""
|
||||
DB_URL: str # 必填
|
||||
|
||||
class Config:
|
||||
env_file = ".env"
|
||||
```
|
||||
|
||||
### Phase 4:性能修复(async 阻塞问题)
|
||||
|
||||
1. 将 `app/services/job.py` 中的 `requests.post` 替换为 `httpx.AsyncClient`(已在依赖中)
|
||||
2. 将 `_batch_send_to_remote_server` 改为 `asyncio.gather` 并发执行
|
||||
3. 将 `app/core/locks.py` 中的同步 `redis.Redis` 替换为 `redis.asyncio.Redis`(独立的 `aioredis` 包已并入 redis-py 4.2+ 且不再维护,不建议新引入)
|
||||
|
||||
### Phase 5:代码去重(可维护性)
|
||||
|
||||
合并 7 个重复的 `_check_*_duplicate` 方法为 1 个通用方法:
|
||||
|
||||
```python
|
||||
async def _check_duplicate(
|
||||
self,
|
||||
table: str,
|
||||
conditions: dict[str, str], # {"column_name": "value"}
|
||||
days: int = 90
|
||||
) -> bool:
|
||||
...
|
||||
```
|
||||
|
||||
删除死代码:`_check_qcwy_company_duplicate_by_name`
|
||||
|
||||
---
|
||||
|
||||
## 三、关键文件索引
|
||||
|
||||
| 文件 | 操作 | 说明 |
|
||||
|------|------|------|
|
||||
| `app/settings/config.py` | 重构 | 凭据移入环境变量 |
|
||||
| `app/services/job.py` | 修复 | F821、E722、async 阻塞、方法去重 |
|
||||
| `app/services/crawler/zhilian.py` | 修复 | 添加 `import json` |
|
||||
| `app/api/v1/token/token.py` | 整理 | 修复 E402 import 顺序 |
|
||||
| `app/services/company_cleaner.py` | 自动修复 | F541 不含占位符的 f-string |
|
||||
| `app/core/locks.py` | 修复 | 同步 redis → 异步 |
|
||||
| `app/repositories/clickhouse_repo.py` | 修复 | 删除未用 import |
|
||||
| `.env.example` | 新建 | 环境变量模板 |
|
||||
|
||||
---
|
||||
|
||||
## 四、风险与缓解
|
||||
|
||||
| 风险 | 缓解措施 |
|
||||
|------|----------|
|
||||
| 修复 F821 时误判变量用途 | 先读原函数完整逻辑再修复 |
|
||||
| 凭据迁移后服务无法启动 | 先创建 `.env` 再重启服务 |
|
||||
| async 改造引入新 bug | 修改后在本地运行完整功能测试 |
|
||||
| 方法合并破坏去重逻辑 | 保持原有 SQL 逻辑不变,只提取公共参数 |
|
||||
|
||||
---
|
||||
|
||||
## 五、执行顺序建议
|
||||
|
||||
```
|
||||
Phase 1(5min) → pipenv run ruff check app/ --fix
|
||||
Phase 2(30min) → 手动修复 F821 × 3、E722 × 1、E402 × 5
|
||||
Phase 3(60min) → 凭据安全迁移(需配合运维创建 .env)
|
||||
Phase 4(90min) → async 阻塞修复(requests → httpx)
|
||||
Phase 5(60min) → 去重方法合并(可选,不影响功能)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## SESSION_ID(供 /ccg:execute 使用)
|
||||
- CODEX_SESSION: N/A(本次分析由 Claude 本地执行)
|
||||
- GEMINI_SESSION: N/A
|
||||
174
AGENTS.md
Normal file
174
AGENTS.md
Normal file
@ -0,0 +1,174 @@
|
||||
# JobData - 招聘数据采集与统计分析平台
|
||||
|
||||
## 变更记录 (Changelog)
|
||||
|
||||
| 版本 | 日期 | 说明 |
|
||||
|------|------|------|
|
||||
| 初始化 | 2026-03-20 | 首次生成架构文档,覆盖全部四个核心模块 |
|
||||
|
||||
---
|
||||
|
||||
## 项目愿景
|
||||
|
||||
JobData 是一个面向招聘市场的全栈数据采集与分析平台。系统从三大主流招聘平台(Boss 直聘、前程无忧、智联招聘)自动抓取职位与公司数据,统一存储到 ClickHouse 列式数据库,并通过 FastAPI 后端 + Vue3 前端提供数据查看、定向清洗、统计分析等能力。ECS 弹性实例管理模块支持在阿里云上按需批量启停爬虫节点。
|
||||
|
||||
---
|
||||
|
||||
## 架构总览
|
||||
|
||||
```
|
||||
[招聘平台] [爬虫层] [后端 API] [数据库]
|
||||
Boss直聘 ──► jobs_spider/boss/ ──► ┌── MySQL
|
||||
前程无忧 ──► jobs_spider/qcwy/ ──► app (FastAPI) ──► └── ClickHouse
|
||||
智联招聘 ──► jobs_spider/zhilian/ ──►
|
||||
▲
|
||||
│
|
||||
web (Vue3) ────────┘
|
||||
(前端页面)
|
||||
|
||||
ecs_full_pipeline.py ──► 阿里云 ECS ──► 批量启动爬虫节点
|
||||
```
|
||||
|
||||
**核心技术栈**
|
||||
|
||||
| 层次 | 技术 |
|
||||
|------|------|
|
||||
| 后端 | Python 3.13, FastAPI 0.111, Tortoise-ORM 0.23, APScheduler |
|
||||
| 数据库(业务) | MySQL(用户/权限/审计/关键词/Token) |
|
||||
| 数据库(采集) | ClickHouse(职位/公司 JSON 原始数据 + 分析视图) |
|
||||
| 爬虫 | requests / httpx / Playwright(Python 脚本) |
|
||||
| 前端 | Vue 3.3, Vite 4, Naive UI, Pinia, ECharts |
|
||||
| 基础设施 | 阿里云 ECS(按量付费的抢占式实例),APScheduler 定时任务 |
|
||||
|
||||
---
|
||||
|
||||
## 模块结构图
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
ROOT["(根) JobData"] --> APP["app - FastAPI 后端"]
|
||||
ROOT --> WEB["web - Vue3 前端"]
|
||||
ROOT --> SPIDER["jobs_spider - 平台爬虫"]
|
||||
ROOT --> ECS["ecs_full_pipeline.py - ECS 批量部署"]
|
||||
|
||||
APP --> APP_API["app/api - 路由层"]
|
||||
APP --> APP_SVC["app/services - 业务逻辑"]
|
||||
APP --> APP_CORE["app/core - 框架核心"]
|
||||
APP --> APP_MODELS["app/models - ORM 模型"]
|
||||
APP --> APP_REPO["app/repositories - 数据仓库"]
|
||||
|
||||
SPIDER --> SP_BOSS["jobs_spider/boss"]
|
||||
SPIDER --> SP_QCWY["jobs_spider/qcwy"]
|
||||
SPIDER --> SP_ZL["jobs_spider/zhilian"]
|
||||
|
||||
click APP "./app/AGENTS.md" "查看 app 模块文档"
|
||||
click WEB "./web/AGENTS.md" "查看 web 模块文档"
|
||||
click SPIDER "./jobs_spider/AGENTS.md" "查看 jobs_spider 模块文档"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 模块索引
|
||||
|
||||
| 模块路径 | 语言 | 职责简述 |
|
||||
|----------|------|----------|
|
||||
| `app/` | Python | FastAPI 后端,提供 REST API、权限管理、定时任务、数据入库与分析 |
|
||||
| `web/` | Vue3/JS | 前端管理界面,数据展示、关键词管理、代理管理、数据清洗操作 |
|
||||
| `jobs_spider/` | Python | 三大平台的爬虫脚本,独立运行,结果通过 HTTP 推送到后端 |
|
||||
| `ecs_full_pipeline.py` | Python | 阿里云 ECS 实例批量创建/销毁/命令下发全流程脚本 |
|
||||
| `reclean_qcwy_jobs.py` | Python | 前程无忧数据重清洗独立脚本 |
|
||||
|
||||
---
|
||||
|
||||
## 运行与开发
|
||||
|
||||
### 后端启动
|
||||
|
||||
```bash
|
||||
# 安装依赖(pipenv)
|
||||
pipenv install
|
||||
|
||||
# 开发模式(默认端口 9999,20 个 worker)
|
||||
python run.py
|
||||
|
||||
# 环境变量覆盖
|
||||
APP_HOST=0.0.0.0 APP_PORT=9999 UVICORN_WORKERS=4 python run.py
|
||||
```
|
||||
|
||||
**关键环境变量**
|
||||
|
||||
| 变量 | 默认值 | 说明 |
|
||||
|------|--------|------|
|
||||
| `APP_HOST` | `0.0.0.0` | 监听地址 |
|
||||
| `APP_PORT` | `9999` | 监听端口 |
|
||||
| `UVICORN_WORKERS` | `20` | Worker 数量 |
|
||||
| `CLICKHOUSE_HOST` | `121.4.126.241` | ClickHouse 地址(需修改为实际地址) |
|
||||
| `CLICKHOUSE_USER` / `CLICKHOUSE_PASS` | 见 config.py | ClickHouse 认证 |
|
||||
| `SMTP_HOST` / `SMTP_USER` / `SMTP_PASS` | 见 config.py | 邮件告警配置 |
|
||||
| `REPORT_ENDPOINT` | 空 | 统计结果 Webhook 上报地址 |
|
||||
| `RUN_MIGRATIONS_ON_STARTUP` | `True` | 是否启动时自动迁移 |
|
||||
| `INITIALIZE_SEED_DATA_ON_STARTUP` | `True` | 是否启动时初始化种子数据 |
|
||||
|
||||
> 安全警告:`config.py` 中 `SECRET_KEY`、数据库连接串、SMTP 密码均为硬编码默认值,生产环境必须通过环境变量覆盖。
|
||||
|
||||
### 前端启动
|
||||
|
||||
```bash
|
||||
cd web
|
||||
pnpm install
|
||||
pnpm dev # 开发模式,默认 http://localhost:5173
|
||||
pnpm build # 构建产物到 web/dist
|
||||
```
|
||||
|
||||
### ECS 批量爬虫部署
|
||||
|
||||
```bash
|
||||
# 需配置阿里云凭据(环境变量或 ~/.alibabacloud/credentials)
|
||||
python ecs_full_pipeline.py
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 定时任务
|
||||
|
||||
APScheduler 在应用启动时注册以下任务(`app/core/scheduler.py`):
|
||||
|
||||
| 任务 ID | 频率 | 职责 |
|
||||
|---------|------|------|
|
||||
| `stats_job` | 每 6 小时 | 统计 ClickHouse 各表总量并通过邮件/Webhook 上报 |
|
||||
| `ecs_full_pipeline` | 每 6 小时 | 调用 `ecs_full_pipeline.py` 批量刷新爬虫节点 |
|
||||
| `ip_alert_job` | 每 10 分钟 | 检查 IP 上报异常并告警 |
|
||||
| `company_cleaning_job` | 每 5 分钟 | 自动清洗待处理公司数据(collect 50 + process 30) |
|
||||
| `daily_cleanup_job` | 每天 00:05 | 清理历史任务运行记录 |
|
||||
|
||||
所有任务通过分布式文件锁(或可选 Redis 锁)保证多 Worker 下只执行一次。
|
||||
|
||||
---
|
||||
|
||||
## 测试策略
|
||||
|
||||
- 当前代码库**无自动化测试文件**(缺口:单元测试、集成测试均缺失)。
|
||||
- 推荐补充:
|
||||
1. `app/services/` 的 service 层单元测试(使用 `pytest` + `anyio`)
|
||||
2. `app/api/v1/` 的 API 集成测试(使用 `httpx.AsyncClient`)
|
||||
3. `jobs_spider/` 的数据解析函数单元测试
|
||||
|
||||
---
|
||||
|
||||
## 编码规范
|
||||
|
||||
- Python:使用 `ruff`(已在 Pipfile 中),格式化用 `black`,排序用 `isort`。
|
||||
- 前端:ESLint(`@zclzone` + `@unocss` 规则集),`prettier` 格式化。
|
||||
- 类型:后端强制 `pydantic` Schema 做入参校验;前端以 JS 为主(未启用严格 TS)。
|
||||
- 日志:后端统一使用 `loguru`,以 `logger.info(...)` 方式输出带结构化字段的日志。
|
||||
|
||||
---
|
||||
|
||||
## AI 使用指引
|
||||
|
||||
- 修改爬虫逻辑时,重点关注反爬机制:`SmartIPManager`、`IPAnomalyDetector` 在 `jobs_spider/boss/boos_api.py` 中实现,随机延迟至少 10 秒。
|
||||
- 新增 API 路由后需同步在 `app/api/v1/__init__.py` 注册,并执行 `api_controller.refresh_api()` 更新权限表。
|
||||
- ClickHouse 表结构变更在 `app/core/clickhouse_init.py` 中维护,**不走 Aerich 迁移**。
|
||||
- MySQL 模型变更走 Aerich(`aerich migrate && aerich upgrade`)。
|
||||
- 前端新增页面需要在 `web/src/views/{模块}/route.js` 和后端 `init_menus()` 中同步注册菜单。
|
||||
- `config.py` 中已硬编码真实 MySQL/ClickHouse 连接串和 SMTP 凭据,**提交代码前务必确认不泄露敏感信息**。
|
||||
502
Pipfile.lock
generated
502
Pipfile.lock
generated
@ -1,7 +1,7 @@
|
||||
{
|
||||
"_meta": {
|
||||
"hash": {
|
||||
"sha256": "4661b2ece0fd4084c531ca39f080cddb1c3f4924207854b9ebeba24f5f092538"
|
||||
"sha256": "06a87f34157b1b7a382087d3b63ddd42071655bd37056b98510d5937e62ee726"
|
||||
},
|
||||
"pipfile-spec": 6,
|
||||
"requires": {
|
||||
@ -89,6 +89,14 @@
|
||||
"markers": "python_version >= '3.6'",
|
||||
"version": "==21.2.0"
|
||||
},
|
||||
"async-timeout": {
|
||||
"hashes": [
|
||||
"sha256:39e3809566ff85354557ec2398b55e096c8364bacac9405a7a1fa429e77fe76c",
|
||||
"sha256:d9321a7a3d5a6a5e187e824d2fa0793ce379a202935782d555d6e9d2735677d3"
|
||||
],
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==5.0.1"
|
||||
},
|
||||
"asyncclick": {
|
||||
"hashes": [
|
||||
"sha256:be146a2d8075d4fe372ff4e877f23c8b5af269d16705c1948123b9415f6fd678"
|
||||
@ -99,59 +107,67 @@
|
||||
},
|
||||
"asyncpg": {
|
||||
"hashes": [
|
||||
"sha256:04ff0785ae7eed6cc138e73fc67b8e51d54ee7a3ce9b63666ce55a0bf095f7ba",
|
||||
"sha256:05b185ebb8083c8568ea8a40e896d5f7af4b8554b64d7719c0eaa1eb5a5c3a70",
|
||||
"sha256:0b448f0150e1c3b96cb0438a0d0aa4871f1472e58de14a3ec320dbb2798fb0d4",
|
||||
"sha256:0f5712350388d0cd0615caec629ad53c81e506b1abaaf8d14c93f54b35e3595a",
|
||||
"sha256:1292b84ee06ac8a2ad8e51c7475aa309245874b61333d97411aab835c4a2f737",
|
||||
"sha256:1b11a555a198b08f5c4baa8f8231c74a366d190755aa4f99aacec5970afe929a",
|
||||
"sha256:1b982daf2441a0ed314bd10817f1606f1c28b1136abd9e4f11335358c2c631cb",
|
||||
"sha256:1c06a3a50d014b303e5f6fc1e5f95eb28d2cee89cf58384b700da621e5d5e547",
|
||||
"sha256:1c198a00cce9506fcd0bf219a799f38ac7a237745e1d27f0e1f66d3707c84a5a",
|
||||
"sha256:26683d3b9a62836fad771a18ecf4659a30f348a561279d6227dab96182f46144",
|
||||
"sha256:29ff1fc8b5bf724273782ff8b4f57b0f8220a1b2324184846b39d1ab4122031d",
|
||||
"sha256:3152fef2e265c9c24eec4ee3d22b4f4d2703d30614b0b6753e9ed4115c8a146f",
|
||||
"sha256:3326e6d7381799e9735ca2ec9fd7be4d5fef5dcbc3cb555d8a463d8460607956",
|
||||
"sha256:3356637f0bd830407b5597317b3cb3571387ae52ddc3bca6233682be88bbbc1f",
|
||||
"sha256:393af4e3214c8fa4c7b86da6364384c0d1b3298d45803375572f415b6f673f38",
|
||||
"sha256:46973045b567972128a27d40001124fbc821c87a6cade040cfcd4fa8a30bcdc4",
|
||||
"sha256:51da377487e249e35bd0859661f6ee2b81db11ad1f4fc036194bc9cb2ead5056",
|
||||
"sha256:574156480df14f64c2d76450a3f3aaaf26105869cad3865041156b38459e935d",
|
||||
"sha256:578445f09f45d1ad7abddbff2a3c7f7c291738fdae0abffbeb737d3fc3ab8b75",
|
||||
"sha256:5b290f4726a887f75dcd1b3006f484252db37602313f806e9ffc4e5996cfe5cb",
|
||||
"sha256:5df69d55add4efcd25ea2a3b02025b669a285b767bfbf06e356d68dbce4234ff",
|
||||
"sha256:5e0511ad3dec5f6b4f7a9e063591d407eee66b88c14e2ea636f187da1dcfff6a",
|
||||
"sha256:64e899bce0600871b55368b8483e5e3e7f1860c9482e7f12e0a771e747988168",
|
||||
"sha256:68d71a1be3d83d0570049cd1654a9bdfe506e794ecc98ad0873304a9f35e411e",
|
||||
"sha256:6c2a2ef565400234a633da0eafdce27e843836256d40705d83ab7ec42074efb3",
|
||||
"sha256:6f4e83f067b35ab5e6371f8a4c93296e0439857b4569850b178a01385e82e9ad",
|
||||
"sha256:8b684a3c858a83cd876f05958823b68e8d14ec01bb0c0d14a6704c5bf9711773",
|
||||
"sha256:9110df111cabc2ed81aad2f35394a00cadf4f2e0635603db6ebbd0fc896f46a4",
|
||||
"sha256:915aeb9f79316b43c3207363af12d0e6fd10776641a7de8a01212afd95bdf0ed",
|
||||
"sha256:9a0292c6af5c500523949155ec17b7fe01a00ace33b68a476d6b5059f9630305",
|
||||
"sha256:9b6fde867a74e8c76c71e2f64f80c64c0f3163e687f1763cfaf21633ec24ec33",
|
||||
"sha256:a3479a0d9a852c7c84e822c073622baca862d1217b10a02dd57ee4a7a081f708",
|
||||
"sha256:aa403147d3e07a267ada2ae34dfc9324e67ccc4cdca35261c8c22792ba2b10cf",
|
||||
"sha256:aca1548e43bbb9f0f627a04666fedaca23db0a31a84136ad1f868cb15deb6e3a",
|
||||
"sha256:ae374585f51c2b444510cdf3595b97ece4f233fde739aa14b50e0d64e8a7a590",
|
||||
"sha256:bc6d84136f9c4d24d358f3b02be4b6ba358abd09f80737d1ac7c444f36108454",
|
||||
"sha256:bfb4dd5ae0699bad2b233672c8fc5ccbd9ad24b89afded02341786887e37927e",
|
||||
"sha256:c42f6bb65a277ce4d93f3fba46b91a265631c8df7250592dd4f11f8b0152150f",
|
||||
"sha256:c47806b1a8cbb0a0db896f4cd34d89942effe353a5035c62734ab13b9f938da3",
|
||||
"sha256:c551e9928ab6707602f44811817f82ba3c446e018bfe1d3abecc8ba5f3eac851",
|
||||
"sha256:c7255812ac85099a0e1ffb81b10dc477b9973345793776b128a23e60148dd1af",
|
||||
"sha256:c902a60b52e506d38d7e80e0dd5399f657220f24635fee368117b8b5fce1142e",
|
||||
"sha256:db9891e2d76e6f425746c5d2da01921e9a16b5a71a1c905b13f30e12a257c4af",
|
||||
"sha256:dc1f62c792752a49f88b7e6f774c26077091b44caceb1983509edc18a2222ec0",
|
||||
"sha256:f23b836dd90bea21104f69547923a02b167d999ce053f3d502081acea2fba15b",
|
||||
"sha256:f59b430b8e27557c3fb9869222559f7417ced18688375825f8f12302c34e915e",
|
||||
"sha256:f86b0e2cd3f1249d6fe6fd6cfe0cd4538ba994e2d8249c0491925629b9104d0f",
|
||||
"sha256:fb622c94db4e13137c4c7f98834185049cc50ee01d8f657ef898b6407c7b9c50",
|
||||
"sha256:fd4406d09208d5b4a14db9a9dbb311b6d7aeeab57bded7ed2f8ea41aeef39b34"
|
||||
"sha256:027eaa61361ec735926566f995d959ade4796f6a49d3bde17e5134b9964f9ba8",
|
||||
"sha256:04d19392716af6b029411a0264d92093b6e5e8285ae97a39957b9a9c14ea72be",
|
||||
"sha256:0b17c89312c2f4ccea222a3a6571f7df65d4ba2c0e803339bfc7bed46a96d3be",
|
||||
"sha256:0bfbcc5b7ffcd9b75ab1558f00db2ae07db9c80637ad1b2469c43df79d7a5ae2",
|
||||
"sha256:0c89ccf741c067614c9b5fc7f1fc6f3b61ab05ae4aaa966e6fd6b93097c7d20d",
|
||||
"sha256:12b3b2e39dc5470abd5e98c8d3373e4b1d1234d9fbdedf538798b2c13c64460a",
|
||||
"sha256:18c83b03bc0d1b23e6230f5bf8d4f217dc9bc08644ce0502a9d91dc9e634a9c7",
|
||||
"sha256:19857a358fc811d82227449b7ca40afb46e75b33eb8897240c3839dd8b744218",
|
||||
"sha256:1b41f1afb1033f2b44f3234993b15096ddc9cd71b21a42dbd87fc6a57b43d65d",
|
||||
"sha256:22bc525ebbdc24d1261ecbf6f504998244d4e3be1721784b5f64664d61fbe602",
|
||||
"sha256:22be6e02381bab3101cd502d9297ac71e2f966c86e20e78caead9934c98a8af6",
|
||||
"sha256:2657204552b75f8288de08ca60faf4a99a65deef3a71d1467454123205a88fab",
|
||||
"sha256:2d076d42eb583601179efa246c5d7ae44614b4144bc1c7a683ad1222814ed095",
|
||||
"sha256:334dec28cf20d7f5bb9e45b39546ddf247f8042a690bff9b9573d00086e69cb5",
|
||||
"sha256:37a58919cfef2448a920df00d1b2f821762d17194d0dbf355d6dde8d952c04f9",
|
||||
"sha256:37fc6c00a814e18eef51833545d1891cac9aa69140598bb076b4cd29b3e010b9",
|
||||
"sha256:3b1fbcb0e396a5ca435a8826a87e5c2c2cc0c8c68eb6fadf82168056b0e53a8c",
|
||||
"sha256:3df118d94f46d85b2e434fd62c84cb66d5834d5a890725fe625f498e72e4d5ec",
|
||||
"sha256:3faa62f997db0c9add34504a68ac2c342cfee4d57a0c3062fcf0d86c7f9cb1e8",
|
||||
"sha256:480c4befbdf079c14c9ca43c8c5e1fe8b6296c96f1f927158d4f1e750aacc047",
|
||||
"sha256:54a64f91839ba59008eccf7aad2e93d6e3de688d796f35803235ea1c4898ae1e",
|
||||
"sha256:5a4af56edf82a701aece93190cc4e094d2df7d33f6e915c222fb09efbb5afc24",
|
||||
"sha256:6d11b198111a72f47154fa03b85799f9be63701e068b43f84ac25da0bda9cb31",
|
||||
"sha256:72d6bdcbc93d608a1158f17932de2321f68b1a967a13e014998db87a72ed3186",
|
||||
"sha256:795416369c3d284e1837461909f58418ad22b305f955e625a4b3a2521d80a5f3",
|
||||
"sha256:831712dd3cf117eec68575a9b50da711893fd63ebe277fc155ecae1c6c9f0f61",
|
||||
"sha256:8df714dba348efcc162d2adf02d213e5fab1bd9f557e1305633e851a61814a7a",
|
||||
"sha256:8ea599d45c361dfbf398cb67da7fd052affa556a401482d3ff1ee99bd68808a1",
|
||||
"sha256:9322b563e2661a52e3cdbc93eed3be7748b289f792e0011cb2720d278b366ce2",
|
||||
"sha256:98cc158c53f46de7bb677fd20c417e264fc02b36d901cc2a43bd6cb0dc6dbfd2",
|
||||
"sha256:9ea33213ac044171f4cac23740bed9a3805abae10e7025314cfbd725ec670540",
|
||||
"sha256:a429e842a3a4b4ea240ea52d7fe3f82d5149853249306f7ff166cb9948faa46c",
|
||||
"sha256:a8d758dac9d2e723e173d286ef5e574f0b350ec00e9186fce84d0fc5f6a8e6b8",
|
||||
"sha256:aad7a33913fb8bcb5454313377cc330fbb19a0cd5faa7272407d8a0c4257b671",
|
||||
"sha256:b44c31e1efc1c15188ef183f287c728e2046abb1d26af4d20858215d50d91fad",
|
||||
"sha256:ba5f8886e850882ff2c2ace5732300e99193823e8107e2c53ef01c1ebfa1e85d",
|
||||
"sha256:bb223567dea5f47c45d347f2bde5486be8d9f40339f27217adb3fb1c3be51298",
|
||||
"sha256:bc2b685f400ceae428f79f78b58110470d7b4466929a7f78d455964b17ad1008",
|
||||
"sha256:bd4107bb7cdd0e9e65fae66a62afd3a249663b844fa34d479f6d5b3bef9c04c3",
|
||||
"sha256:bd5b6efff3c17c3202d4b37189969acf8927438a238c6257f66be3c426beba20",
|
||||
"sha256:bdb957706da132e982cc6856bb2f7b740603472b54c3ebc77fe60ea3e57e1bd2",
|
||||
"sha256:bef056aa502ee34204c161c72ca1f3c274917596877f825968368b2c33f585f4",
|
||||
"sha256:c0807be46c32c963ae40d329b3a686356e417f674c976c07fa49f1b30303f109",
|
||||
"sha256:c0e0822b1038dc7253b337b0f3f676cadc4ac31b126c5d42691c39691962e403",
|
||||
"sha256:c1a9c5b71d2371a2290bc93336cd05ba4ec781683cab292adbddc084f89443c6",
|
||||
"sha256:c1e1ab5bc65373d92dd749d7308c5b26fb2dc0fbe5d3bf68a32b676aa3bcd24a",
|
||||
"sha256:c204fab1b91e08b0f47e90a75d1b3c62174dab21f670ad6c5d0f243a228f015b",
|
||||
"sha256:c989386c83940bfbd787180f2b1519415e2d3d6277a70d9d0f0145ac73500735",
|
||||
"sha256:cea3a0b2a14f95834cee29432e4ddc399b95700eb1d51bbc5bfee8f31fa07b2b",
|
||||
"sha256:dc5f2fa9916f292e5c5c8b2ac2813763bcd7f58e130055b4ad8a0531314201ab",
|
||||
"sha256:e009abc333464ff18b8f6fd146addffd9aaf63e79aa3bb40ab7a4c332d0c5e9e",
|
||||
"sha256:e5d5098f63beeae93512ee513d4c0c53dc12e9aa2b7a1af5a81cddf93fe4e4da",
|
||||
"sha256:e6974f36eb9a224d8fb428bcf66bd411aa12cf57c2967463178149e73d4de366",
|
||||
"sha256:ebb3cde58321a1f89ce41812be3f2a98dddedc1e76d0838aba1d724f1e4e1a95",
|
||||
"sha256:eee690960e8ab85063ba93af2ce128c0f52fd655fdff9fdb1a28df01329f031d",
|
||||
"sha256:f6b56b91bb0ffc328c4e3ed113136cddd9deefdf5f79ab448598b9772831df44",
|
||||
"sha256:f890de5e1e4f7e14023619399a471ce4b71f5418cd67a51853b9910fdfa73696"
|
||||
],
|
||||
"index": "pip_conf_index_global",
|
||||
"markers": "python_full_version >= '3.8.0'",
|
||||
"version": "==0.30.0"
|
||||
"markers": "python_full_version >= '3.9.0'",
|
||||
"version": "==0.31.0"
|
||||
},
|
||||
"black": {
|
||||
"hashes": [
|
||||
@ -308,6 +324,14 @@
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==2.0.0"
|
||||
},
|
||||
"exceptiongroup": {
|
||||
"hashes": [
|
||||
"sha256:8b412432c6055b0b7d14c310000ae93352ed6754f70fa8f7c34141f91c4e3219",
|
||||
"sha256:a7a39a3bd276781e98394987d3a5701d0c4edffb633bb7a5144577f82c773598"
|
||||
],
|
||||
"markers": "python_version >= '3.7'",
|
||||
"version": "==1.3.1"
|
||||
},
|
||||
"fastapi": {
|
||||
"hashes": [
|
||||
"sha256:97ecbf994be0bcbdadedf88c3150252bed7b2087075ac99735403b1b76cc8fc0",
|
||||
@ -326,6 +350,59 @@
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==0.0.7"
|
||||
},
|
||||
"greenlet": {
|
||||
"hashes": [
|
||||
"sha256:04633da773ae432649a3f092a8e4add390732cc9e1ab52c8ff2c91b8dc86f202",
|
||||
"sha256:04e6a202cde56043fd355fefd1552c4caa5c087528121871d950eb4f1b51fa99",
|
||||
"sha256:050703a60603db0e817364d69e048c70af299040c13a7e67792b9e62d4571196",
|
||||
"sha256:0bc06a78fa3ffbe2a75f1ebc7e040eacf6fa1050a9432953ab111fbbbf0d03c1",
|
||||
"sha256:0d2a78e6f1bf3f1672df91e212a2f8314e1e7c922f065d14cbad4bc815059467",
|
||||
"sha256:15871afc0d78ec87d15d8412b337f287fc69f8f669346e391585824970931c48",
|
||||
"sha256:2acb30e77042f747ca81f0a10cc153296567e92e666c5e1b117f4595afd43352",
|
||||
"sha256:2c7429f6e9cea7cbf2637d86d3db12806ba970f7f972fcab39d6b54b4457cbaf",
|
||||
"sha256:34cc7cf8ab6f4b85298b01e13e881265ee7b3c1daf6bc10a2944abc15d4f87c3",
|
||||
"sha256:3828b309dfb1f117fe54867512a8265d8d4f00f8de6908eef9b885f4d8789062",
|
||||
"sha256:393c03c26c865f17f31d8db2f09603fadbe0581ad85a5d5908b131549fc38217",
|
||||
"sha256:4544ab2cfd5912e42458b13516429e029f87d8bbcdc8d5506db772941ae12493",
|
||||
"sha256:45fcea7b697b91290b36eafc12fff479aca6ba6500d98ef6f34d5634c7119cbe",
|
||||
"sha256:472841de62d60f2cafd60edd4fd4dd7253eb70e6eaf14b8990dcaf177f4af957",
|
||||
"sha256:499b809e7738c8af0ff9ac9d5dd821cb93f4293065a9237543217f0b252f950a",
|
||||
"sha256:5bf0d7d62e356ef2e87e55e46a4e930ac165f9372760fb983b5631bb479e9d3a",
|
||||
"sha256:5ceb29d1f74c7280befbbfa27b9bf91ba4a07a1a00b2179a5d953fc219b16c42",
|
||||
"sha256:60c06b502d56d5451f60ca665691da29f79ed95e247bcf8ce5024d7bbe64acb9",
|
||||
"sha256:6712bfd520530eb67331813f7112d3ee18e206f48b3d026d8a96cd2d2ad20251",
|
||||
"sha256:67725ae9fea62c95cf1aa230f1b8d4dc38f7cd14f6103d1df8a5a95657eb8e54",
|
||||
"sha256:6dff6433742073e5b6ad40953a78a0e8cddcb3f6869e5ea635d29a810ca5e7d0",
|
||||
"sha256:6e8fe0c72603201a86b2e038daf9b6c8570715f8779566419cff543b6ace88de",
|
||||
"sha256:7123b29e6bad2f3f89681be4ef316480fca798ebe8d22fbaced9cc3775007a4f",
|
||||
"sha256:752c896a8c976548faafe8a306d446c6a4c68d4fd24699b84d4393bd9ac69a8e",
|
||||
"sha256:7d951e7d628a6e8b68af469f0fe4f100ef64c4054abeb9cdafbfaa30a920c950",
|
||||
"sha256:87b791dd0e031a574249af717ac36f7031b18c35329561c1e0368201c18caf1f",
|
||||
"sha256:a145f4b1c4ed7a2c94561b7f18b4beec3d3fb6f0580db22f7ed1d544e0620b34",
|
||||
"sha256:a5e4b25e855800fba17713020c5c33e0a4b7a1829027719344f0c7c8870092a2",
|
||||
"sha256:ac8db07bced2c39b987bba13a3195f8157b0cfbce54488f86919321444a1cc3c",
|
||||
"sha256:acabf468466d18017e2ae5fbf1a5a88b86b48983e550e1ae1437b69a83d9f4ac",
|
||||
"sha256:bd593db7ee1fa8a513a48a404f8cc4126998a48025e3f5cbbc68d51be0a6bf66",
|
||||
"sha256:bdd67619cefe1cc9fcab57c8853d2bb36eca9f166c0058cc0d428d471f7c785c",
|
||||
"sha256:c11fe0cfb0ce33132f0b5d27eeadd1954976a82e5e9b60909ec2c4b884a55382",
|
||||
"sha256:c5445ddb7b586d870dad32ca9fc47c287d6022a528d194efdb8912093c5303ad",
|
||||
"sha256:c816554eb33e7ecf9ba4defcb1fd8c994e59be6b4110da15480b3e7447ea4286",
|
||||
"sha256:c8317d732e2ae0935d9ed2af2ea876fa714cf6f3b887a31ca150b54329b0a6e9",
|
||||
"sha256:cc1d01bdd67db3e5711e6246e451d7a0f75fae7bbf40adde129296a7f9aa7cc9",
|
||||
"sha256:ce8aed6fdd5e07d3cbb988cbdc188266a4eb9e1a52db9ef5c6526e59962d3933",
|
||||
"sha256:d5583b2ffa677578a384337ee13125bdf9a427485d689014b39d638a4f3d8dbe",
|
||||
"sha256:d7456e67b0be653dfe643bb37d9566cd30939c80f858e2ce6d2d54951f75b14a",
|
||||
"sha256:dbe0e81e24982bb45907ca20152b31c2e3300ca352fdc4acbd4956e4a2cbc195",
|
||||
"sha256:e3f03ddd7142c758ab41c18089a1407b9959bd276b4e6dfbd8fd06403832c87a",
|
||||
"sha256:e66872daffa360b2537170b73ad530f14fa31785b1bc78080125d92edf0a6def",
|
||||
"sha256:edbf4ab9a7057ee430a678fe2ef37ea5d69125d6bdc7feb42ed8d871c737e63b",
|
||||
"sha256:f2cc88b50b9006b324c1b9f5f3552f9d4564c78af57cdfb4c7baf4f0aa089146",
|
||||
"sha256:f96e2bb8a56b7e1aed1dbfbbe0050cb2ecca99c7c91892fd1771e3afab63b3e3",
|
||||
"sha256:fd904626b8779810062cb455514594776e3cba3b8c0ba4939894df9f7b384971"
|
||||
],
|
||||
"markers": "python_version >= '3.9'",
|
||||
"version": "==3.2.5"
|
||||
},
|
||||
"h11": {
|
||||
"hashes": [
|
||||
"sha256:8f19fbbe99e72420ff35c00b27a34cb9937e902a8b810e2c88300c6f0a3b699d",
|
||||
@ -545,83 +622,54 @@
|
||||
},
|
||||
"numpy": {
|
||||
"hashes": [
|
||||
"sha256:07b62978075b67eee4065b166d000d457c82a1efe726cce608b9db9dd66a73a5",
|
||||
"sha256:087ffc25890d89a43536f75c5fe8770922008758e8eeeef61733957041ed2f9b",
|
||||
"sha256:092aeb3449833ea9c0bf0089d70c29ae480685dd2377ec9cdbbb620257f84631",
|
||||
"sha256:095737ed986e00393ec18ec0b21b47c22889ae4b0cd2d5e88342e08b01141f58",
|
||||
"sha256:0a4f2021a6da53a0d580d6ef5db29947025ae8b35b3250141805ea9a32bbe86b",
|
||||
"sha256:103ea7063fa624af04a791c39f97070bf93b96d7af7eb23530cd087dc8dbe9dc",
|
||||
"sha256:11e58218c0c46c80509186e460d79fbdc9ca1eb8d8aee39d8f2dc768eb781089",
|
||||
"sha256:122bf5ed9a0221b3419672493878ba4967121514b1d7d4656a7580cd11dddcbf",
|
||||
"sha256:14a91ebac98813a49bc6aa1a0dfc09513dcec1d97eaf31ca21a87221a1cdcb15",
|
||||
"sha256:1f91e5c028504660d606340a084db4b216567ded1056ea2b4be4f9d10b67197f",
|
||||
"sha256:20b8200721840f5621b7bd03f8dcd78de33ec522fc40dc2641aa09537df010c3",
|
||||
"sha256:240259d6564f1c65424bcd10f435145a7644a65a6811cfc3201c4a429ba79170",
|
||||
"sha256:2738534837c6a1d0c39340a190177d7d66fdf432894f469728da901f8f6dc910",
|
||||
"sha256:27c9f90e7481275c7800dc9c24b7cc40ace3fdb970ae4d21eaff983a32f70c91",
|
||||
"sha256:293b2192c6bcce487dbc6326de5853787f870aeb6c43f8f9c6496db5b1781e45",
|
||||
"sha256:2c3271cc4097beb5a60f010bcc1cc204b300bb3eafb4399376418a83a1c6373c",
|
||||
"sha256:2f4f0215edb189048a3c03bd5b19345bdfa7b45a7a6f72ae5945d2a28272727f",
|
||||
"sha256:3dcf02866b977a38ba3ec10215220609ab9667378a9e2150615673f3ffd6c73b",
|
||||
"sha256:4209f874d45f921bde2cff1ffcd8a3695f545ad2ffbef6d3d3c6768162efab89",
|
||||
"sha256:448a66d052d0cf14ce9865d159bfc403282c9bc7bb2a31b03cc18b651eca8b1a",
|
||||
"sha256:4ae6863868aaee2f57503c7a5052b3a2807cf7a3914475e637a0ecd366ced220",
|
||||
"sha256:4d002ecf7c9b53240be3bb69d80f86ddbd34078bae04d87be81c1f58466f264e",
|
||||
"sha256:4e6ecfeddfa83b02318f4d84acf15fbdbf9ded18e46989a15a8b6995dfbf85ab",
|
||||
"sha256:508b0eada3eded10a3b55725b40806a4b855961040180028f52580c4729916a2",
|
||||
"sha256:546aaf78e81b4081b2eba1d105c3b34064783027a06b3ab20b6eba21fb64132b",
|
||||
"sha256:572d5512df5470f50ada8d1972c5f1082d9a0b7aa5944db8084077570cf98370",
|
||||
"sha256:5ad4ebcb683a1f99f4f392cc522ee20a18b2bb12a2c1c42c3d48d5a1adc9d3d2",
|
||||
"sha256:66459dccc65d8ec98cc7df61307b64bf9e08101f9598755d42d8ae65d9a7a6ee",
|
||||
"sha256:6936aff90dda378c09bea075af0d9c675fe3a977a9d2402f95a87f440f59f619",
|
||||
"sha256:69779198d9caee6e547adb933941ed7520f896fd9656834c300bdf4dd8642712",
|
||||
"sha256:6f1ae3dcb840edccc45af496f312528c15b1f79ac318169d094e85e4bb35fdf1",
|
||||
"sha256:71669b5daae692189540cffc4c439468d35a3f84f0c88b078ecd94337f6cb0ec",
|
||||
"sha256:72c6df2267e926a6d5286b0a6d556ebe49eae261062059317837fda12ddf0c1a",
|
||||
"sha256:72dbebb2dcc8305c431b2836bcc66af967df91be793d63a24e3d9b741374c450",
|
||||
"sha256:754d6755d9a7588bdc6ac47dc4ee97867271b17cee39cb87aef079574366db0a",
|
||||
"sha256:76c3e9501ceb50b2ff3824c3589d5d1ab4ac857b0ee3f8f49629d0de55ecf7c2",
|
||||
"sha256:7a0e27186e781a69959d0230dd9909b5e26024f8da10683bd6344baea1885168",
|
||||
"sha256:7d6e390423cc1f76e1b8108c9b6889d20a7a1f59d9a60cac4a050fa734d6c1e2",
|
||||
"sha256:8145dd6d10df13c559d1e4314df29695613575183fa2e2d11fac4c208c8a1f73",
|
||||
"sha256:8446acd11fe3dc1830568c941d44449fd5cb83068e5c70bd5a470d323d448296",
|
||||
"sha256:852ae5bed3478b92f093e30f785c98e0cb62fa0a939ed057c31716e18a7a22b9",
|
||||
"sha256:87c930d52f45df092f7578889711a0768094debf73cfcde105e2d66954358125",
|
||||
"sha256:8b1224a734cd509f70816455c3cffe13a4f599b1bf7130f913ba0e2c0b2006c0",
|
||||
"sha256:8dc082ea901a62edb8f59713c6a7e28a85daddcb67454c839de57656478f5b19",
|
||||
"sha256:906a30249315f9c8e17b085cc5f87d3f369b35fedd0051d4a84686967bdbbd0b",
|
||||
"sha256:938065908d1d869c7d75d8ec45f735a034771c6ea07088867f713d1cd3bbbe4f",
|
||||
"sha256:9c144440db4bf3bb6372d2c3e49834cc0ff7bb4c24975ab33e01199e645416f2",
|
||||
"sha256:9e196ade2400c0c737d93465327d1ae7c06c7cb8a1756121ebf54b06ca183c7f",
|
||||
"sha256:a3ef07ec8cbc8fc9e369c8dcd52019510c12da4de81367d8b20bc692aa07573a",
|
||||
"sha256:a7af9ed2aa9ec5950daf05bb11abc4076a108bd3c7db9aa7251d5f107079b6a6",
|
||||
"sha256:a9f66e7d2b2d7712410d3bc5684149040ef5f19856f20277cd17ea83e5006286",
|
||||
"sha256:aa098a5ab53fa407fded5870865c6275a5cd4101cfdef8d6fafc48286a96e981",
|
||||
"sha256:af58de8745f7fa9ca1c0c7c943616c6fe28e75d0c81f5c295810e3c83b5be92f",
|
||||
"sha256:b05a89f2fb84d21235f93de47129dd4f11c16f64c87c33f5e284e6a3a54e43f2",
|
||||
"sha256:b5e40e80299607f597e1a8a247ff8d71d79c5b52baa11cc1cce30aa92d2da6e0",
|
||||
"sha256:b9d0878b21e3918d76d2209c924ebb272340da1fb51abc00f986c258cd5e957b",
|
||||
"sha256:bc3186bea41fae9d8e90c2b4fb5f0a1f5a690682da79b92574d63f56b529080b",
|
||||
"sha256:c63d95dc9d67b676e9108fe0d2182987ccb0f11933c1e8959f42fa0da8d4fa56",
|
||||
"sha256:c771cfac34a4f2c0de8e8c97312d07d64fd8f8ed45bc9f5726a7e947270152b5",
|
||||
"sha256:c8d9727f5316a256425892b043736d63e89ed15bbfe6556c5ff4d9d4448ff3b3",
|
||||
"sha256:cbc95b3813920145032412f7e33d12080f11dc776262df1712e1638207dde9e8",
|
||||
"sha256:cefc2219baa48e468e3db7e706305fcd0c095534a192a08f31e98d83a7d45fb0",
|
||||
"sha256:d95f59afe7f808c103be692175008bab926b59309ade3e6d25009e9a171f7036",
|
||||
"sha256:dd937f088a2df683cbb79dda9a772b62a3e5a8a7e76690612c2737f38c6ef1b6",
|
||||
"sha256:de6ea4e5a65d5a90c7d286ddff2b87f3f4ad61faa3db8dabe936b34c2275b6f8",
|
||||
"sha256:e0486a11ec30cdecb53f184d496d1c6a20786c81e55e41640270130056f8ee48",
|
||||
"sha256:ee807923782faaf60d0d7331f5e86da7d5e3079e28b291973c545476c2b00d07",
|
||||
"sha256:efc81393f25f14d11c9d161e46e6ee348637c0a1e8a54bf9dedc472a3fae993b",
|
||||
"sha256:f0a1a8476ad77a228e41619af2fa9505cf69df928e9aaa165746584ea17fed2b",
|
||||
"sha256:f75018be4980a7324edc5930fe39aa391d5734531b1926968605416ff58c332d",
|
||||
"sha256:f92d6c2a8535dc4fe4419562294ff957f83a16ebdec66df0805e473ffaad8bd0",
|
||||
"sha256:fb1752a3bb9a3ad2d6b090b88a9a0ae1cd6f004ef95f75825e2f382c183b2097",
|
||||
"sha256:fc927d7f289d14f5e037be917539620603294454130b6de200091e23d27dc9be",
|
||||
"sha256:fed5527c4cf10f16c6d0b6bee1f89958bccb0ad2522c8cadc2efd318bcd545f5"
|
||||
"sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a",
|
||||
"sha256:11a76c372d1d37437857280aa142086476136a8c0f373b2e648ab2c8f18fb195",
|
||||
"sha256:13e689d772146140a252c3a28501da66dfecd77490b498b168b501835041f951",
|
||||
"sha256:1e795a8be3ddbac43274f18588329c72939870a16cae810c2b73461c40718ab1",
|
||||
"sha256:26df23238872200f63518dd2aa984cfca675d82469535dc7162dc2ee52d9dd5c",
|
||||
"sha256:286cd40ce2b7d652a6f22efdfc6d1edf879440e53e76a75955bc0c826c7e64dc",
|
||||
"sha256:2b2955fa6f11907cf7a70dab0d0755159bca87755e831e47932367fc8f2f2d0b",
|
||||
"sha256:2da5960c3cf0df7eafefd806d4e612c5e19358de82cb3c343631188991566ccd",
|
||||
"sha256:312950fdd060354350ed123c0e25a71327d3711584beaef30cdaa93320c392d4",
|
||||
"sha256:423e89b23490805d2a5a96fe40ec507407b8ee786d66f7328be214f9679df6dd",
|
||||
"sha256:496f71341824ed9f3d2fd36cf3ac57ae2e0165c143b55c3a035ee219413f3318",
|
||||
"sha256:49ca4decb342d66018b01932139c0961a8f9ddc7589611158cb3c27cbcf76448",
|
||||
"sha256:51129a29dbe56f9ca83438b706e2e69a39892b5eda6cedcb6b0c9fdc9b0d3ece",
|
||||
"sha256:5fec9451a7789926bcf7c2b8d187292c9f93ea30284802a0ab3f5be8ab36865d",
|
||||
"sha256:671bec6496f83202ed2d3c8fdc486a8fc86942f2e69ff0e986140339a63bcbe5",
|
||||
"sha256:7f0a0c6f12e07fa94133c8a67404322845220c06a9e80e85999afe727f7438b8",
|
||||
"sha256:807ec44583fd708a21d4a11d94aedf2f4f3c3719035c76a2bbe1fe8e217bdc57",
|
||||
"sha256:883c987dee1880e2a864ab0dc9892292582510604156762362d9326444636e78",
|
||||
"sha256:8c5713284ce4e282544c68d1c3b2c7161d38c256d2eefc93c1d683cf47683e66",
|
||||
"sha256:8cafab480740e22f8d833acefed5cc87ce276f4ece12fdaa2e8903db2f82897a",
|
||||
"sha256:8df823f570d9adf0978347d1f926b2a867d5608f434a7cff7f7908c6570dcf5e",
|
||||
"sha256:9059e10581ce4093f735ed23f3b9d283b9d517ff46009ddd485f1747eb22653c",
|
||||
"sha256:905d16e0c60200656500c95b6b8dca5d109e23cb24abc701d41c02d74c6b3afa",
|
||||
"sha256:9189427407d88ff25ecf8f12469d4d39d35bee1db5d39fc5c168c6f088a6956d",
|
||||
"sha256:96a55f64139912d61de9137f11bf39a55ec8faec288c75a54f93dfd39f7eb40c",
|
||||
"sha256:97032a27bd9d8988b9a97a8c4d2c9f2c15a81f61e2f21404d7e8ef00cb5be729",
|
||||
"sha256:984d96121c9f9616cd33fbd0618b7f08e0cfc9600a7ee1d6fd9b239186d19d97",
|
||||
"sha256:9a92ae5c14811e390f3767053ff54eaee3bf84576d99a2456391401323f4ec2c",
|
||||
"sha256:9ea91dfb7c3d1c56a0e55657c0afb38cf1eeae4544c208dc465c3c9f3a7c09f9",
|
||||
"sha256:a15f476a45e6e5a3a79d8a14e62161d27ad897381fecfa4a09ed5322f2085669",
|
||||
"sha256:a392a68bd329eafac5817e5aefeb39038c48b671afd242710b451e76090e81f4",
|
||||
"sha256:a3f4ab0caa7f053f6797fcd4e1e25caee367db3112ef2b6ef82d749530768c73",
|
||||
"sha256:a46288ec55ebbd58947d31d72be2c63cbf839f0a63b49cb755022310792a3385",
|
||||
"sha256:a61ec659f68ae254e4d237816e33171497e978140353c0c2038d46e63282d0c8",
|
||||
"sha256:a842d573724391493a97a62ebbb8e731f8a5dcc5d285dfc99141ca15a3302d0c",
|
||||
"sha256:becfae3ddd30736fe1889a37f1f580e245ba79a5855bff5f2a29cb3ccc22dd7b",
|
||||
"sha256:c05e238064fc0610c840d1cf6a13bf63d7e391717d247f1bf0318172e759e692",
|
||||
"sha256:c1c9307701fec8f3f7a1e6711f9089c06e6284b3afbbcd259f7791282d660a15",
|
||||
"sha256:c7b0be4ef08607dd04da4092faee0b86607f111d5ae68036f16cc787e250a131",
|
||||
"sha256:cfd41e13fdc257aa5778496b8caa5e856dc4896d4ccf01841daee1d96465467a",
|
||||
"sha256:d731a1c6116ba289c1e9ee714b08a8ff882944d4ad631fd411106a30f083c326",
|
||||
"sha256:df55d490dea7934f330006d0f81e8551ba6010a5bf035a249ef61a94f21c500b",
|
||||
"sha256:ec9852fb39354b5a45a80bdab5ac02dd02b15f44b3804e9f00c556bf24b4bded",
|
||||
"sha256:f15975dfec0cf2239224d80e32c3170b1d168335eaedee69da84fbe9f1f9cd04",
|
||||
"sha256:f26b258c385842546006213344c50655ff1555a9338e2e5e02a0756dc3e803dd"
|
||||
],
|
||||
"markers": "python_version >= '3.11'",
|
||||
"version": "==2.3.2"
|
||||
"markers": "python_version >= '3.9'",
|
||||
"version": "==2.0.2"
|
||||
},
|
||||
"openpyxl": {
|
||||
"hashes": [
|
||||
@ -725,52 +773,65 @@
|
||||
},
|
||||
"pandas": {
|
||||
"hashes": [
|
||||
"sha256:025e92411c16cbe5bb2a4abc99732a6b132f439b8aab23a59fa593eb00704232",
|
||||
"sha256:09e3b1587f0f3b0913e21e8b32c3119174551deb4a4eba4a89bc7377947977e7",
|
||||
"sha256:0a95b9ac964fe83ce317827f80304d37388ea77616b1425f0ae41c9d2d0d7bb2",
|
||||
"sha256:0f951fbb702dacd390561e0ea45cdd8ecfa7fb56935eb3dd78e306c19104b9b0",
|
||||
"sha256:1b916a627919a247d865aed068eb65eb91a344b13f5b57ab9f610b7716c92de1",
|
||||
"sha256:1c78cf43c8fde236342a1cb2c34bcff89564a7bfed7e474ed2fffa6aed03a956",
|
||||
"sha256:1d12f618d80379fde6af007f65f0c25bd3e40251dbd1636480dfffce2cf1e6da",
|
||||
"sha256:22c2e866f7209ebc3a8f08d75766566aae02bcc91d196935a1d9e59c7b990ac9",
|
||||
"sha256:2323294c73ed50f612f67e2bf3ae45aea04dce5690778e08a09391897f35ff88",
|
||||
"sha256:2b0540963d83431f5ce8870ea02a7430adca100cec8a050f0811f8e31035541b",
|
||||
"sha256:2ba6aff74075311fc88504b1db890187a3cd0f887a5b10f5525f8e2ef55bfdb9",
|
||||
"sha256:2eb789ae0274672acbd3c575b0598d213345660120a257b47b5dafdc618aec83",
|
||||
"sha256:2f4d6feeba91744872a600e6edbbd5b033005b431d5ae8379abee5bcfa479fab",
|
||||
"sha256:342e59589cc454aaff7484d75b816a433350b3d7964d7847327edda4d532a2e3",
|
||||
"sha256:3462c3735fe19f2638f2c3a40bd94ec2dc5ba13abbb032dd2fa1f540a075509d",
|
||||
"sha256:3583d348546201aff730c8c47e49bc159833f971c2899d6097bce68b9112a4f1",
|
||||
"sha256:4645f770f98d656f11c69e81aeb21c6fca076a44bed3dcbb9396a4311bc7f6d8",
|
||||
"sha256:4d544806b485ddf29e52d75b1f559142514e60ef58a832f74fb38e48d757b299",
|
||||
"sha256:56a342b231e8862c96bdb6ab97170e203ce511f4d0429589c8ede1ee8ece48b8",
|
||||
"sha256:5db9637dbc24b631ff3707269ae4559bce4b7fd75c1c4d7e13f40edc42df4444",
|
||||
"sha256:689968e841136f9e542020698ee1c4fbe9caa2ed2213ae2388dc7b81721510d3",
|
||||
"sha256:6de8547d4fdb12421e2d047a2c446c623ff4c11f47fddb6b9169eb98ffba485a",
|
||||
"sha256:6f3bf5ec947526106399a9e1d26d40ee2b259c66422efdf4de63c848492d91bb",
|
||||
"sha256:782647ddc63c83133b2506912cc6b108140a38a37292102aaa19c81c83db2928",
|
||||
"sha256:7dcb79bf373a47d2a40cf7232928eb7540155abbc460925c2c96d2d30b006eb4",
|
||||
"sha256:8dfc17328e8da77be3cf9f47509e5637ba8f137148ed0e9b5241e1baf526e20a",
|
||||
"sha256:9026bd4a80108fac2239294a15ef9003c4ee191a0f64b90f170b40cfb7cf2d22",
|
||||
"sha256:911580460fc4884d9b05254b38a6bfadddfcc6aaef856fb5859e7ca202e45275",
|
||||
"sha256:98bcc8b5bf7afed22cc753a28bc4d9e26e078e777066bc53fac7904ddef9a678",
|
||||
"sha256:9b7ff55f31c4fcb3e316e8f7fa194566b286d6ac430afec0d461163312c5841e",
|
||||
"sha256:ac942bfd0aca577bef61f2bc8da8147c4ef6879965ef883d8e8d5d2dc3e744b8",
|
||||
"sha256:b3cd4273d3cb3707b6fffd217204c52ed92859533e31dc03b7c5008aa933aaab",
|
||||
"sha256:b4b0de34dc8499c2db34000ef8baad684cfa4cbd836ecee05f323ebfba348c7d",
|
||||
"sha256:ca7ed14832bce68baef331f4d7f294411bed8efd032f8109d690df45e00c4679",
|
||||
"sha256:cd05b72ec02ebfb993569b4931b2e16fbb4d6ad6ce80224a3ee838387d83a191",
|
||||
"sha256:dd71c47a911da120d72ef173aeac0bf5241423f9bfea57320110a978457e069e",
|
||||
"sha256:e5635178b387bd2ba4ac040f82bc2ef6e6b500483975c4ebacd34bec945fda12",
|
||||
"sha256:e6723a27ad7b244c0c79d8e7007092d7c8f0f11305770e2f4cd778b3ad5f9f85",
|
||||
"sha256:ec6c851509364c59a5344458ab935e6451b31b818be467eb24b0fe89bd05b6b9",
|
||||
"sha256:fe37e757f462d31a9cd7580236a82f353f5713a80e059a29753cf938c6775d96",
|
||||
"sha256:fe67dc676818c186d5a3d5425250e40f179c2a89145df477dd82945eaea89e97",
|
||||
"sha256:fe7317f578c6a153912bd2292f02e40c1d8f253e93c599e82620c7f69755c74f"
|
||||
"sha256:0242fe9a49aa8b4d78a4fa03acb397a58833ef6199e9aa40a95f027bb3a1b6e7",
|
||||
"sha256:1611aedd912e1ff81ff41c745822980c49ce4a7907537be8692c8dbc31924593",
|
||||
"sha256:1b07204a219b3b7350abaae088f451860223a52cfb8a6c53358e7948735158e5",
|
||||
"sha256:1d37b5848ba49824e5c30bedb9c830ab9b7751fd049bc7914533e01c65f79791",
|
||||
"sha256:23ebd657a4d38268c7dfbdf089fbc31ea709d82e4923c5ffd4fbd5747133ce73",
|
||||
"sha256:2462b1a365b6109d275250baaae7b760fd25c726aaca0054649286bcfbb3e8ec",
|
||||
"sha256:28083c648d9a99a5dd035ec125d42439c6c1c525098c58af0fc38dd1a7a1b3d4",
|
||||
"sha256:2e3ebdb170b5ef78f19bfb71b0dc5dc58775032361fa188e814959b74d726dd5",
|
||||
"sha256:318d77e0e42a628c04dc56bcef4b40de67918f7041c2b061af1da41dcff670ac",
|
||||
"sha256:371a4ab48e950033bcf52b6527eccb564f52dc826c02afd9a1bc0ab731bba084",
|
||||
"sha256:376c6446ae31770764215a6c937f72d917f214b43560603cd60da6408f183b6c",
|
||||
"sha256:3869faf4bd07b3b66a9f462417d0ca3a9df29a9f6abd5d0d0dbab15dac7abe87",
|
||||
"sha256:3fd2f887589c7aa868e02632612ba39acb0b8948faf5cc58f0850e165bd46f35",
|
||||
"sha256:4793891684806ae50d1288c9bae9330293ab4e083ccd1c5e383c34549c6e4250",
|
||||
"sha256:4e0a175408804d566144e170d0476b15d78458795bb18f1304fb94160cabf40c",
|
||||
"sha256:503cf027cf9940d2ceaa1a93cfb5f8c8c7e6e90720a2850378f0b3f3b1e06826",
|
||||
"sha256:5554c929ccc317d41a5e3d1234f3be588248e61f08a74dd17c9eabb535777dc9",
|
||||
"sha256:56851a737e3470de7fa88e6131f41281ed440d29a9268dcbf0002da5ac366713",
|
||||
"sha256:5caf26f64126b6c7aec964f74266f435afef1c1b13da3b0636c7518a1fa3e2b1",
|
||||
"sha256:602b8615ebcc4a0c1751e71840428ddebeb142ec02c786e8ad6b1ce3c8dec523",
|
||||
"sha256:6253c72c6a1d990a410bc7de641d34053364ef8bcd3126f7e7450125887dffe3",
|
||||
"sha256:6435cb949cb34ec11cc9860246ccb2fdc9ecd742c12d3304989017d53f039a78",
|
||||
"sha256:6d21f6d74eb1725c2efaa71a2bfc661a0689579b58e9c0ca58a739ff0b002b53",
|
||||
"sha256:6d2cefc361461662ac48810cb14365a365ce864afe85ef1f447ff5a1e99ea81c",
|
||||
"sha256:74ecdf1d301e812db96a465a525952f4dde225fdb6d8e5a521d47e1f42041e21",
|
||||
"sha256:75ea25f9529fdec2d2e93a42c523962261e567d250b0013b16210e1d40d7c2e5",
|
||||
"sha256:854d00d556406bffe66a4c0802f334c9ad5a96b4f1f868adf036a21b11ef13ff",
|
||||
"sha256:8fe25fc7b623b0ef6b5009149627e34d2a4657e880948ec3c840e9402e5c1b45",
|
||||
"sha256:900f47d8f20860de523a1ac881c4c36d65efcb2eb850e6948140fa781736e110",
|
||||
"sha256:93c2d9ab0fc11822b5eece72ec9587e172f63cff87c00b062f6e37448ced4493",
|
||||
"sha256:a16dcec078a01eeef8ee61bf64074b4e524a2a3f4b3be9326420cabe59c4778b",
|
||||
"sha256:a21d830e78df0a515db2b3d2f5570610f5e6bd2e27749770e8bb7b524b89b450",
|
||||
"sha256:a45c765238e2ed7d7c608fc5bc4a6f88b642f2f01e70c0c23d2224dd21829d86",
|
||||
"sha256:a637c5cdfa04b6d6e2ecedcb81fc52ffb0fd78ce2ebccc9ea964df9f658de8c8",
|
||||
"sha256:a68e15f780eddf2b07d242e17a04aa187a7ee12b40b930bfdd78070556550e98",
|
||||
"sha256:b3d11d2fda7eb164ef27ffc14b4fcab16a80e1ce67e9f57e19ec0afaf715ba89",
|
||||
"sha256:b468d3dad6ff947df92dcb32ede5b7bd41a9b3cceef0a30ed925f6d01fb8fa66",
|
||||
"sha256:b98560e98cb334799c0b07ca7967ac361a47326e9b4e5a7dfb5ab2b1c9d35a1b",
|
||||
"sha256:bdcd9d1167f4885211e401b3036c0c8d9e274eee67ea8d0758a256d60704cfe8",
|
||||
"sha256:bf1f8a81d04ca90e32a0aceb819d34dbd378a98bf923b6398b9a3ec0bf44de29",
|
||||
"sha256:c46467899aaa4da076d5abc11084634e2d197e9460643dd455ac3db5856b24d6",
|
||||
"sha256:c4fc4c21971a1a9f4bdb4c73978c7f7256caa3e62b323f70d6cb80db583350bc",
|
||||
"sha256:c503ba5216814e295f40711470446bc3fd00f0faea8a086cbc688808e26f92a2",
|
||||
"sha256:d051c0e065b94b7a3cea50eb1ec32e912cd96dba41647eb24104b6c6c14c5788",
|
||||
"sha256:d3e28b3e83862ccf4d85ff19cf8c20b2ae7e503881711ff2d534dc8f761131aa",
|
||||
"sha256:db4301b2d1f926ae677a751eb2bd0e8c5f5319c9cb3f88b0becbbb0b07b34151",
|
||||
"sha256:dd7478f1463441ae4ca7308a70e90b33470fa593429f9d4c578dd00d1fa78838",
|
||||
"sha256:e05e1af93b977f7eafa636d043f9f94c7ee3ac81af99c13508215942e64c993b",
|
||||
"sha256:e19d192383eab2f4ceb30b412b22ea30690c9e618f78870357ae1d682912015a",
|
||||
"sha256:e32e7cc9af0f1cc15548288a51a3b681cc2a219faa838e995f7dc53dbab1062d",
|
||||
"sha256:ecaf1e12bdc03c86ad4a7ea848d66c685cb6851d807a26aa245ca3d2017a1908",
|
||||
"sha256:ee15f284898e7b246df8087fc82b87b01686f98ee67d85a17b7ab44143a3a9a0",
|
||||
"sha256:ee67acbbf05014ea6c763beb097e03cd629961c8a632075eeb34247120abcb4b",
|
||||
"sha256:f086f6fe114e19d92014a1966f43a3e62285109afe874f067f5abbdcbb10e59c",
|
||||
"sha256:f8bfc0e12dc78f777f323f55c58649591b2cd0c43534e8355c51d3fede5f4dee"
|
||||
],
|
||||
"index": "pip_conf_index_global",
|
||||
"markers": "python_version >= '3.9'",
|
||||
"version": "==2.3.1"
|
||||
"version": "==2.3.3"
|
||||
},
|
||||
"passlib": {
|
||||
"hashes": [
|
||||
@ -798,6 +859,21 @@
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==4.3.6"
|
||||
},
|
||||
"playwright": {
|
||||
"hashes": [
|
||||
"sha256:1dd93b265688da46e91ecb0606d36f777f8eadcf7fbef12f6426b20bf0c9137c",
|
||||
"sha256:284ed5a706b7c389a06caa431b2f0ba9ac4130113c3a779767dda758c2497bb1",
|
||||
"sha256:38a1bae6c0a07839cdeaddbc0756b3b2b85e476c07945f64ece08f1f956a86f1",
|
||||
"sha256:5f065f5a133dbc15e6e7c71e7bc04f258195755b1c32a432b792e28338c8335e",
|
||||
"sha256:6caefb08ed2c6f29d33b8088d05d09376946e49a73be19271c8cd5384b82b14c",
|
||||
"sha256:9351c1ac3dfd9b3820fe7fc4340d96c0d3736bb68097b9b7a69bd45d25e9370c",
|
||||
"sha256:99104771abc4eafee48f47dac2369e0015516dc1ce8c409807d2dd440828b9a4",
|
||||
"sha256:a4a9d65027bce48eeba842408bcc1421502dfd7e41e28d207e94260fa93ca67e"
|
||||
],
|
||||
"index": "pip_conf_index_global",
|
||||
"markers": "python_version >= '3.9'",
|
||||
"version": "==1.57.0"
|
||||
},
|
||||
"pycparser": {
|
||||
"hashes": [
|
||||
"sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6",
|
||||
@ -932,6 +1008,21 @@
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==2.7.1"
|
||||
},
|
||||
"pyee": {
|
||||
"hashes": [
|
||||
"sha256:0b931f7c14535667ed4c7e0d531716368715e860b988770fc7eb8578d1f67fc8",
|
||||
"sha256:af2f8fede4171ef667dfded53f96e2ed0d6e6bd7ee3bb46437f77e3b57689228"
|
||||
],
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==13.0.1"
|
||||
},
|
||||
"pyexecjs": {
|
||||
"hashes": [
|
||||
"sha256:34cc1d070976918183ff7bdc0ad71f8157a891c92708c00c5fbbff7a769f505c"
|
||||
],
|
||||
"index": "pip_conf_index_global",
|
||||
"version": "==1.5.1"
|
||||
},
|
||||
"pygments": {
|
||||
"hashes": [
|
||||
"sha256:61c16d2a8576dc0649d9f39e089b5f02bcd27fba10d8fb4dcc28173f7a45151f",
|
||||
@ -959,6 +1050,16 @@
|
||||
"markers": "python_version >= '3.8' and python_version < '4.0'",
|
||||
"version": "==0.3.2"
|
||||
},
|
||||
"pysocks": {
|
||||
"hashes": [
|
||||
"sha256:08e69f092cc6dbe92a0fdd16eeb9b9ffbc13cadfe5ca4c7bd92ffb078b293299",
|
||||
"sha256:2725bd0a9925919b9b51739eea5f9e2bae91e83288108a9ad338b2e3a4435ee5",
|
||||
"sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"
|
||||
],
|
||||
"index": "pip_conf_index_global",
|
||||
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3'",
|
||||
"version": "==1.7.1"
|
||||
},
|
||||
"python-dateutil": {
|
||||
"hashes": [
|
||||
"sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3",
|
||||
@ -1202,6 +1303,59 @@
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==0.37.2"
|
||||
},
|
||||
"tomli": {
|
||||
"hashes": [
|
||||
"sha256:0408e3de5ec77cc7f81960c362543cbbd91ef883e3138e81b729fc3eea5b9729",
|
||||
"sha256:0dc56fef0e2c1c470aeac5b6ca8cc7b640bb93e92d9803ddaf9ea03e198f5b0b",
|
||||
"sha256:0e0fe8a0b8312acf3a88077a0802565cb09ee34107813bba1c7cd591fa6cfc8d",
|
||||
"sha256:0f2e3955efea4d1cfbcb87bc321e00dc08d2bcb737fd1d5e398af111d86db5df",
|
||||
"sha256:133e93646ec4300d651839d382d63edff11d8978be23da4cc106f5a18b7d0576",
|
||||
"sha256:1b168f2731796b045128c45982d3a4874057626da0e2ef1fdd722848b741361d",
|
||||
"sha256:1c8a885b370751837c029ef9bc014f27d80840e48bac415f3412e6593bbc18c1",
|
||||
"sha256:1f776e7d669ebceb01dee46484485f43a4048746235e683bcdffacdf1fb4785a",
|
||||
"sha256:1fb2945cbe303b1419e2706e711b7113da57b7db31ee378d08712d678a34e51e",
|
||||
"sha256:20cedb4ee43278bc4f2fee6cb50daec836959aadaf948db5172e776dd3d993fc",
|
||||
"sha256:20ffd184fb1df76a66e34bd1b36b4a4641bd2b82954befa32fe8163e79f1a702",
|
||||
"sha256:26ab906a1eb794cd4e103691daa23d95c6919cc2fa9160000ac02370cc9dd3f6",
|
||||
"sha256:2add28aacc7425117ff6364fe9e06a183bb0251b03f986df0e78e974047571fd",
|
||||
"sha256:2b1e3b80e1d5e52e40e9b924ec43d81570f0e7d09d11081b797bc4692765a3d4",
|
||||
"sha256:31d556d079d72db7c584c0627ff3a24c5d3fb4f730221d3444f3efb1b2514776",
|
||||
"sha256:36b9d05b51e65b254ea6c2585b59d2c4cb91c8a3d91d0ed0f17591a29aaea54a",
|
||||
"sha256:39b0b5d1b6dd03684b3fb276407ebed7090bbec989fa55838c98560c01113b66",
|
||||
"sha256:3cf226acb51d8f1c394c1b310e0e0e61fecdd7adcb78d01e294ac297dd2e7f87",
|
||||
"sha256:3d895d56bd3f82ddd6faaff993c275efc2ff38e52322ea264122d72729dca2b2",
|
||||
"sha256:413540dce94673591859c4c6f794dfeaa845e98bf35d72ed59636f869ef9f86f",
|
||||
"sha256:43e685b9b2341681907759cf3a04e14d7104b3580f808cfde1dfdb60ada85475",
|
||||
"sha256:4cbcb367d44a1f0c2be408758b43e1ffb5308abe0ea222897d6bfc8e8281ef2f",
|
||||
"sha256:551e321c6ba03b55676970b47cb1b73f14a0a4dce6a3e1a9458fd6d921d72e95",
|
||||
"sha256:5572e41282d5268eb09a697c89a7bee84fae66511f87533a6f88bd2f7b652da9",
|
||||
"sha256:5aa48d7c2356055feef06a43611fc401a07337d5b006be13a30f6c58f869e3c3",
|
||||
"sha256:5b5807f3999fb66776dbce568cc9a828544244a8eb84b84b9bafc080c99597b9",
|
||||
"sha256:5e3f639a7a8f10069d0e15408c0b96a2a828cfdec6fca05296ebcdcc28ca7c76",
|
||||
"sha256:685306e2cc7da35be4ee914fd34ab801a6acacb061b6a7abca922aaf9ad368da",
|
||||
"sha256:75c2f8bbddf170e8effc98f5e9084a8751f8174ea6ccf4fca5398436e0320bc8",
|
||||
"sha256:7b438885858efd5be02a9a133caf5812b8776ee0c969fea02c45e8e3f296ba51",
|
||||
"sha256:7d49c66a7d5e56ac959cb6fc583aff0651094ec071ba9ad43df785abc2320d86",
|
||||
"sha256:7d6d9a4aee98fac3eab4952ad1d73aee87359452d1c086b5ceb43ed02ddb16b8",
|
||||
"sha256:84d081fbc252d1b6a982e1870660e7330fb8f90f676f6e78b052ad4e64714bf0",
|
||||
"sha256:8768715ffc41f0008abe25d808c20c3d990f42b6e2e58305d5da280ae7d1fa3b",
|
||||
"sha256:920b1de295e72887bafa3ad9f7a792f811847d57ea6b1215154030cf131f16b1",
|
||||
"sha256:9a08144fa4cba33db5255f9b74f0b89888622109bd2776148f2597447f92a94e",
|
||||
"sha256:a26d7ff68dfdb9f87a016ecfd1e1c2bacbe3108f4e0f8bcd2228ef9a766c787d",
|
||||
"sha256:aa89c3f6c277dd275d8e243ad24f3b5e701491a860d5121f2cdd399fbb31fc9c",
|
||||
"sha256:b5ef256a3fd497d4973c11bf142e9ed78b150d36f5773f1ca6088c230ffc5867",
|
||||
"sha256:b6c78bdf37764092d369722d9946cb65b8767bfa4110f902a1b2542d8d173c8a",
|
||||
"sha256:bbb1b10aa643d973366dc2cb1ad94f99c1726a02343d43cbc011edbfac579e7c",
|
||||
"sha256:c084ad935abe686bd9c898e62a02a19abfc9760b5a79bc29644463eaf2840cb0",
|
||||
"sha256:c73add4bb52a206fd0c0723432db123c0c75c280cbd67174dd9d2db228ebb1b4",
|
||||
"sha256:cae9c19ed12d4e8f3ebf46d1a75090e4c0dc16271c5bce1c833ac168f08fb614",
|
||||
"sha256:d20b797a5c1ad80c516e41bc1fb0443ddb5006e9aaa7bda2d71978346aeb9132",
|
||||
"sha256:d3d1654e11d724760cdb37a3d7691f0be9db5fbdaef59c9f532aabf87006dbaa",
|
||||
"sha256:d878f2a6707cc9d53a1be1414bbb419e629c3d6e67f69230217bb663e76b5087"
|
||||
],
|
||||
"markers": "python_version >= '3.8'",
|
||||
"version": "==2.4.0"
|
||||
},
|
||||
"tortoise-orm": {
|
||||
"hashes": [
|
||||
"sha256:deaabed1619ea8aab6213508dff025571a701b7f34ee534473d7bb7661aa9f4f",
|
||||
@ -1231,11 +1385,11 @@
|
||||
},
|
||||
"tzdata": {
|
||||
"hashes": [
|
||||
"sha256:1a403fada01ff9221ca8044d701868fa132215d84beb92242d9acd2147f667a8",
|
||||
"sha256:b60a638fcc0daffadf82fe0f57e53d06bdec2f36c4df66280ae79bce6bd6f2b9"
|
||||
"sha256:06a47e5700f3081aab02b2e513160914ff0694bce9947d6b76ebd6bf57cfc5d1",
|
||||
"sha256:de39c2ca5dc7b0344f2eba86f49d614019d29f060fc4ebc8a417896a620b56a7"
|
||||
],
|
||||
"markers": "python_version >= '2'",
|
||||
"version": "==2025.2"
|
||||
"version": "==2025.3"
|
||||
},
|
||||
"ujson": {
|
||||
"hashes": [
|
||||
|
||||
589
aaa.json
Normal file
589
aaa.json
Normal file
@ -0,0 +1,589 @@
|
||||
{
|
||||
"source_type": "智联招聘",
|
||||
"name": "深圳市安保医疗科技股份有限公司",
|
||||
"common_name": "深圳市安保医疗科技股份有限公司",
|
||||
"title": "质量体系工程师",
|
||||
"title_addr": "质量体系工程师",
|
||||
"description": "岗位职责:1、负责公司质量管理体系的策划,运营,维护和合规性的管理;2、负责公司各类医疗器械质量管理体系(国内GMP、ISO13485、QSR820、MDSAP、 MDR、ISO9001、 职业健康和安全,环境管理体系等)的运营、维护与升级; 3、负责公司各类外部体系审核工作(国内飞检、CE审核、飞检等)的内部协调与组织工作; 4、负责生产、经营数据上报,及负责监控不良事件上报; 5、负责相关行业...",
|
||||
"education": "本科",
|
||||
"skill": "医疗器械质量管理体系,ISO认证,GMP认证,FDA认证,二三类医械经验",
|
||||
"welfare": "",
|
||||
"years": "3-5年",
|
||||
"salary": "1.2-1.8万",
|
||||
"location": "深圳宝安",
|
||||
"position": "深圳宝安",
|
||||
"job_type": "全职",
|
||||
"size": "500-999人",
|
||||
"employer_type": "民营",
|
||||
"industry": "医疗设备/器械",
|
||||
"job_1st_class": "",
|
||||
"job_2nd_class": "",
|
||||
"job_3rd_class": "",
|
||||
"job_4th_class": "",
|
||||
"date": "2026-01-22 11:23:37",
|
||||
"start_date": "",
|
||||
"end_date": "",
|
||||
"age": "",
|
||||
"sex": "",
|
||||
"number": "1",
|
||||
"url": "http://jobs.zhaopin.com/CC219609310J40829101009.htm",
|
||||
"company_id": "21960931",
|
||||
"company_name": "深圳市安保医疗科技股份有限公司",
|
||||
"company_url": "http://company.zhaopin.com/CC219609310.htm",
|
||||
"company_desc": "<p>深圳市安保医疗科技股份有限公司(简称“安保医疗”)成立于2001年,是国家级高新技术企业、国家级专精特新重点“小巨人”企业,深耕急救与生命支持领域二十余年,致力于为全球医疗系统提供一体化综合解决方案。</p><p><br></p><p>作为国内首家危急重症一体化专业制造商,公司构建了完善的研发创新体系,拥有广东省及深圳市急救和生命支持类医疗设备工程技术研究中心、高端急危重症医疗设备广东省工程研究中心、深圳市高端医疗设备中小试基地、深圳市博士后创新实践基地等多层次科研平台。全面掌握了融合按压、气道管理、机械通气、电除颤等核心技术,实现了关键技术与核心部件的自主可控,推动高端医疗装备的国产化进程。</p><p><br></p><p>目前,安保医疗已上市40余款生命支持设备,涵盖心肺复苏机、急救转运呼吸机、除颤仪等关键产品,拥有近1000项专利技术,多项集成化设备达到国际先进水平,部分技术全球领先。公司牵头承担了十余项国家及省市级重大科研项目,与50余家顶级医院、30余所高校建立深度协同,构建了产学研用一体化的创新生态。</p><p><br></p><p>公司凭借卓越的技术实力与创新能力,先后荣获2023年深圳市科技进步奖一等奖、2018年广东省科技进步奖二等奖、2023年及2024年中国专利奖优秀奖、2024年广东省及深圳市制造业单项冠军等多项荣誉,成为中国高端医疗装备自主创新与国产替代的中坚力量。</p>",
|
||||
"base_data": {
|
||||
"abroadFlag": 2,
|
||||
"abroadTipInfo": {
|
||||
"abroadTips": [],
|
||||
"icon": "",
|
||||
"title": ""
|
||||
},
|
||||
"adResponse": None,
|
||||
"aiPositionRecommendLevel": "",
|
||||
"aiPositionRecommendReason": "",
|
||||
"alreadyCallPhone": False,
|
||||
"applyType": "1",
|
||||
"campusBestCompany": {
|
||||
"bestCompanyUrl": "",
|
||||
"homepageType": 0,
|
||||
"logoTagUrl": "",
|
||||
"state": 0
|
||||
},
|
||||
"campusJobDetail": None,
|
||||
"campusJobMatchData": None,
|
||||
"campusPositionCardTagInfo": None,
|
||||
"campusRootOrgInfo": None,
|
||||
"canBeRegular": False,
|
||||
"canRemoteInternship": False,
|
||||
"cardCustomJson": "{"address":"深圳 宝安 石岩","companyName":"深圳市安保医疗","locationType":"1","salary60":"1.2-1.8万","strengthLabel":"A轮"}",
|
||||
"cardType": 1,
|
||||
"chatWindow": 1,
|
||||
"cityDistrict": "宝安",
|
||||
"cityId": "765",
|
||||
"commercialLabel": [],
|
||||
"commonTrack": {
|
||||
"trackCommercialFeature": "",
|
||||
"trackSocialSearchEmergencyFeature": False
|
||||
},
|
||||
"companyId": 21960931,
|
||||
"companyLogo": "https://rd5-public.zhaopin.cn/imgs/company/043dbec10b9ee8b8104f537b2280e6c4.jpg",
|
||||
"companyName": "深圳市安保医疗科技股份有限公司",
|
||||
"companyNumber": "CZ219609310",
|
||||
"companyRootId": 21960931,
|
||||
"companyScaleTypeTagsNew": [],
|
||||
"companySize": "500-999人",
|
||||
"companyUrl": "http://company.zhaopin.com/CC219609310.htm",
|
||||
"complainFlag": False,
|
||||
"deliveryPath": "",
|
||||
"displayPhoneNumber": False,
|
||||
"distance": 0.0,
|
||||
"distanceFormat": "",
|
||||
"distanceText": "",
|
||||
"education": "本科",
|
||||
"experimentInfo": None,
|
||||
"extend": None,
|
||||
"extensions": None,
|
||||
"featureServer": {
|
||||
"jdViews3d": "38",
|
||||
"lastReplyTime": 1773997359829,
|
||||
"lastReplyTimeText": "",
|
||||
"staffAvgFirstResponseTime7d": 1332,
|
||||
"staffAvgHandleResumeTime30d": 3681,
|
||||
"staffHandleResumeCnts30d": 191,
|
||||
"staffReplyRate30d": 0.48,
|
||||
"todayReplyNum": 0,
|
||||
"todayReplyNumText": ""
|
||||
},
|
||||
"feedOperation": None,
|
||||
"feedPosition": None,
|
||||
"financingStage": {
|
||||
"code": 3,
|
||||
"name": "A轮"
|
||||
},
|
||||
"firstPublishTime": "2026-01-22 11:23:37",
|
||||
"hasAppliedPosition": False,
|
||||
"industryCompanyTags": [
|
||||
"500030000",
|
||||
"1200030000"
|
||||
],
|
||||
"industryName": "医疗设备/器械",
|
||||
"industryTags": [
|
||||
"500210000"
|
||||
],
|
||||
"innerBusinessInfo": {
|
||||
"customIndustryList": [
|
||||
{
|
||||
"itemCode": 500210000,
|
||||
"name": "医疗设备/器械",
|
||||
"standard": True
|
||||
}
|
||||
]
|
||||
},
|
||||
"internshipMonths": 0,
|
||||
"isNewPosition": 0,
|
||||
"jdCardType": 2,
|
||||
"jobDetailData": {
|
||||
"company": {
|
||||
"base": None,
|
||||
"companyAuditNature": None,
|
||||
"companyComment": None,
|
||||
"companyInterview": None,
|
||||
"jumpDetail": None,
|
||||
"orgBestRanking": None,
|
||||
"orgReliableCompany": None,
|
||||
"other": None,
|
||||
"state": None
|
||||
},
|
||||
"companyProxy": {
|
||||
"companyAddress": "",
|
||||
"companyImage": "",
|
||||
"companyName": "",
|
||||
"companySize": "",
|
||||
"entryCompanyTitle": ""
|
||||
},
|
||||
"customAttributeInfo": {
|
||||
"platformRemind": "",
|
||||
"reportItems": [],
|
||||
"welfareItems": [],
|
||||
"workTimeItems": []
|
||||
},
|
||||
"debug": {},
|
||||
"experimentInfo": {
|
||||
"blueCollarJobTitleExperimentInfo": None
|
||||
},
|
||||
"featureServer": None,
|
||||
"imSection": None,
|
||||
"internship": [],
|
||||
"live": {
|
||||
"liveItems": [],
|
||||
"liveQuickFocusChecked": 0,
|
||||
"liveQuickFocusState": 0,
|
||||
"recommendLiveList": [],
|
||||
"state": 0
|
||||
},
|
||||
"operationSection": {
|
||||
"topJobBannerArea": None
|
||||
},
|
||||
"partTime": [],
|
||||
"position": {
|
||||
"base": {
|
||||
"deliveryPath": "",
|
||||
"education": "本科",
|
||||
"educationCode": "",
|
||||
"maxSalary": "",
|
||||
"minSalary": "",
|
||||
"positionId": 40829101009,
|
||||
"positionName": "质量体系工程师",
|
||||
"positionNumber": "CC219609310J40829101009",
|
||||
"positionUrl": "",
|
||||
"positionWorkingExp": "3-5年",
|
||||
"positionWorkingExpCode": "",
|
||||
"propertyType": "",
|
||||
"salary": "1.2-1.8万",
|
||||
"salaryReal": "",
|
||||
"workType": "全职"
|
||||
},
|
||||
"date": {
|
||||
"dateEnd": "",
|
||||
"dateStart": "",
|
||||
"firstPublishTime": "",
|
||||
"positionPublishTime": "",
|
||||
"positionUpdateTime": "",
|
||||
"positionUpdateTimeText": ""
|
||||
},
|
||||
"desc": {
|
||||
"description": "岗位职责:\n1、负责公司质量管理体系的策划,运营,维护和合规性的管理;\n2、负责公司各类医疗器械质量管理体系(国内GMP、ISO13485、QSR820、MDSAP、 MDR、ISO9001、 职业健康和安全,环境管理体系等)的运营、维护与升级; \n3、负责公司各类外部体系审核工作(国内飞检、CE审核、飞检等)的内部协调与组织工作; \n4、负责生产、经营数据上报,及负责监控不良事件上报; \n5、负责相关行业法律法规的收集、整理、受控及内部传递;\n6、协助质量经理做好其他部门工作。\n岗位要求:\n1.本科及以上学历,大学英文4级以上的阅读能力,专业不限;\n2.至少三年以上二类有源或三类有源医疗器械质量体系管理经验;\n3.有医疗器械内审员资格证,有GCP资格证优先;\n4.熟悉MDSAP,或MDR2017/745优先。",
|
||||
"descriptionHighlight": "",
|
||||
"highlightLabels": [],
|
||||
"labels": [
|
||||
"医疗器械质量管理体系",
|
||||
"ISO认证",
|
||||
"GMP认证",
|
||||
"FDA认证",
|
||||
"二三类医械经验",
|
||||
"医疗设备/器械"
|
||||
],
|
||||
"performanceBonus": "",
|
||||
"welfareLabel": [],
|
||||
"welfareTags": []
|
||||
},
|
||||
"jobType": {
|
||||
"jobType": "",
|
||||
"jobTypeLevel": "15000400000000",
|
||||
"jobTypeLevelName": "",
|
||||
"subJobType": "",
|
||||
"subJobTypeLevel": "15000400170000",
|
||||
"subJobTypeLevelName": ""
|
||||
},
|
||||
"onlineCarHailingExtend": {
|
||||
"gray": False,
|
||||
"promiseGuarantee": ""
|
||||
},
|
||||
"onlineCarInfo": [],
|
||||
"other": {
|
||||
"customJobGroup": "DEFAULT",
|
||||
"deliveredPreviouslyTip": "",
|
||||
"jobKeyword": {
|
||||
"keywords": []
|
||||
},
|
||||
"jobSkillTags": [],
|
||||
"jobTypeIsBlueCollar": False,
|
||||
"overseasList": [],
|
||||
"pageStyle": 0,
|
||||
"positionCommercialLabel": [],
|
||||
"positionHighlight": "",
|
||||
"propertyTypeUrl": "",
|
||||
"rpoProxyDisplayOrgName": "",
|
||||
"urgentRecruitmentUrl": ""
|
||||
},
|
||||
"preferredHrInfo": {
|
||||
"icon": "",
|
||||
"introduce": "",
|
||||
"jumpUrl": "",
|
||||
"preferredHr": False
|
||||
},
|
||||
"todayInterview": None,
|
||||
"workLocation": {
|
||||
"address": "工作地点:宝安区 · 石岩",
|
||||
"addressType": 0,
|
||||
"latitude": "22.6397107496406",
|
||||
"longitude": "113.92158006824309",
|
||||
"positionCityDistrict": "",
|
||||
"positionCityDistrictCode": "",
|
||||
"positionCityId": "765",
|
||||
"positionWorkCity": "",
|
||||
"showMap": True,
|
||||
"showMultiAddressesTip": "",
|
||||
"staticMapUrl": "https://storage-public.zhaopin.cn/job/share/1772075366278481083/6e4958065143498cad2bbfc1aef40d83",
|
||||
"streetName": "",
|
||||
"tradingArea": "",
|
||||
"travelMode": "bus",
|
||||
"verifyTheTruthUrl": "",
|
||||
"workAddress": "深圳宝安创维创新谷-8号楼"
|
||||
}
|
||||
},
|
||||
"proxyWarning": None,
|
||||
"recommender": {
|
||||
"avatar": "",
|
||||
"name": "",
|
||||
"state": 0,
|
||||
"text": "",
|
||||
"title": ""
|
||||
},
|
||||
"secure": {
|
||||
"abroadLabel": "",
|
||||
"abroadTipInfo": None,
|
||||
"safeCenter": None,
|
||||
"safetyReminder": None
|
||||
},
|
||||
"staff": {
|
||||
"activityLevel": [
|
||||
"高回复率"
|
||||
],
|
||||
"auditNaturePrompt": None,
|
||||
"authenticationState": 0,
|
||||
"avatar": "https://storage-public.zhaopin.cn/zp/clouddisk/1771900452818918625/fee00116-221c-49df-a42a-6fb6f9cfcd09.png?x-oss-process=image/resize,l_240/rotate,0",
|
||||
"companyName": "深圳市安保医疗科技股份有限公司",
|
||||
"goldMedalInterviewer": None,
|
||||
"greeting": "",
|
||||
"greetingHasDelivery": "",
|
||||
"hrJob": "HR",
|
||||
"hrOnlineIocState": 0,
|
||||
"hrOnlineState": "",
|
||||
"hrResumeOperationState": "",
|
||||
"hrStateInfo": "",
|
||||
"id": 1168020727,
|
||||
"lastOnlineTime": 0,
|
||||
"lastOnlineTimeText": "",
|
||||
"modularState": 1,
|
||||
"other": {
|
||||
"freeTag": None,
|
||||
"tagUrl": ""
|
||||
},
|
||||
"positionDetailStaffQuickReply": None,
|
||||
"staffName": "陈女士"
|
||||
},
|
||||
"stateInfo": {
|
||||
"deliveryAfterGuide": None,
|
||||
"imSessionInfoDetail": {
|
||||
"imChatStatus": 0,
|
||||
"imChatStatusForChatBeforeDelivery": 0,
|
||||
"imDeliveryTitle": "",
|
||||
"referType": -1
|
||||
},
|
||||
"positionBehaviorState": {
|
||||
"deliveryState": 0,
|
||||
"favoriteState": 0,
|
||||
"followHrState": 0,
|
||||
"imReplyState": -1,
|
||||
"negativeState": 0,
|
||||
"sessionChatState": 0
|
||||
},
|
||||
"signUpStatusInfo": {},
|
||||
"state": {
|
||||
"abroadFlag": 2,
|
||||
"applyType": "",
|
||||
"callProcess": "",
|
||||
"hasAppliedPosition": False,
|
||||
"positionDeliveryType": "",
|
||||
"positionSourceType": 1,
|
||||
"workMode": "ONSITE",
|
||||
"workModeDesc": ""
|
||||
},
|
||||
"useNewAfterDeliveryStyle": False
|
||||
},
|
||||
"verifyTheTruth": None,
|
||||
"verifyTrueFeedback": None
|
||||
},
|
||||
"jobHitReason": "",
|
||||
"jobHitReasonHighlights": [],
|
||||
"jobId": 40829101009,
|
||||
"jobKeyword": {
|
||||
"keywords": [
|
||||
{
|
||||
"itemValue": ""
|
||||
},
|
||||
{
|
||||
"itemValue": ""
|
||||
},
|
||||
{
|
||||
"itemValue": ""
|
||||
},
|
||||
{
|
||||
"itemValue": ""
|
||||
},
|
||||
{
|
||||
"itemValue": ""
|
||||
},
|
||||
{
|
||||
"itemValue": ""
|
||||
},
|
||||
{
|
||||
"itemValue": ""
|
||||
},
|
||||
{
|
||||
"itemValue": ""
|
||||
},
|
||||
{
|
||||
"itemValue": ""
|
||||
},
|
||||
{
|
||||
"itemValue": ""
|
||||
},
|
||||
{
|
||||
"itemValue": ""
|
||||
},
|
||||
{
|
||||
"itemValue": ""
|
||||
}
|
||||
]
|
||||
},
|
||||
"jobKnowledgeWelfareFeatures": [],
|
||||
"jobPostingTime": 1769052217250,
|
||||
"jobRootOrgInfo": {
|
||||
"cityName": "深圳",
|
||||
"reviewOrgNature": 1
|
||||
},
|
||||
"jobSkillTags": [
|
||||
{
|
||||
"id": 19824127,
|
||||
"name": "医疗器械质量管理体系",
|
||||
"standard": False
|
||||
},
|
||||
{
|
||||
"id": 19381048,
|
||||
"name": "ISO认证",
|
||||
"standard": False
|
||||
},
|
||||
{
|
||||
"id": 18958610,
|
||||
"name": "GMP认证",
|
||||
"standard": False
|
||||
},
|
||||
{
|
||||
"id": 19366784,
|
||||
"name": "FDA认证",
|
||||
"standard": False
|
||||
},
|
||||
{
|
||||
"id": 488082095,
|
||||
"name": "二三类医械经验",
|
||||
"standard": False
|
||||
}
|
||||
],
|
||||
"jobSummary": "岗位职责:1、负责公司质量管理体系的策划,运营,维护和合规性的管理;2、负责公司各类医疗器械质量管理体系(国内GMP、ISO13485、QSR820、MDSAP、 MDR、ISO9001、 职业健康和安全,环境管理体系等)的运营、维护与升级; 3、负责公司各类外部体系审核工作(国内飞检、CE审核、飞检等)的内部协调与组织工作; 4、负责生产、经营数据上报,及负责监控不良事件上报; 5、负责相关行业...",
|
||||
"liveCard": {
|
||||
"icon": "",
|
||||
"liveState": 0,
|
||||
"liveTips": "",
|
||||
"roomId": 0,
|
||||
"startTimeFormat": "",
|
||||
"videoUrl": ""
|
||||
},
|
||||
"matchInfo": {
|
||||
"icon": "",
|
||||
"matched": 0,
|
||||
"tagState": 0
|
||||
},
|
||||
"menVipLevel": 0,
|
||||
"name": "质量体系工程师",
|
||||
"needMajor": [],
|
||||
"number": "CC219609310J40829101009",
|
||||
"orgBestEmployerFlag": 1,
|
||||
"orgCommercialTags": [],
|
||||
"orgPayedFlag": 1,
|
||||
"payload": {
|
||||
"name": "",
|
||||
"partition": "",
|
||||
"score": "",
|
||||
"weight": ""
|
||||
},
|
||||
"positionCommercialLabel": [],
|
||||
"positionExpandCardData": "",
|
||||
"positionExpandCardType": 0,
|
||||
"positionHighlight": "",
|
||||
"positionOfNlp": 1,
|
||||
"positionSourceType": 1,
|
||||
"positionSourceTypeUrl": "",
|
||||
"positionURL": "http://jobs.zhaopin.com/CC219609310J40829101009.htm",
|
||||
"positionUrl": "http://jobs.zhaopin.com/CC219609310J40829101009.htm",
|
||||
"property": "民营",
|
||||
"propertyCode": "5",
|
||||
"propertyName": "民营",
|
||||
"propertyType": "",
|
||||
"propertyTypeUrl": "",
|
||||
"provideInternshipCertificate": False,
|
||||
"proxyModel": {
|
||||
"proxiedOrgName": "",
|
||||
"proxiedOrgSize": "",
|
||||
"recruitPosition": 0
|
||||
},
|
||||
"publishTime": "2026-03-05 09:51:08",
|
||||
"recallSign": {
|
||||
"gMethod": "config-position_search-position_mbscore-ANONYMOUS-welfare-words",
|
||||
"gParam": "query-ps-mbscore-3",
|
||||
"gQuery": "query-ps-mbscore-3",
|
||||
"gSort": "query-ps-mbscore-3",
|
||||
"gSource": "solr.source_position_query",
|
||||
"gWeight": 0
|
||||
},
|
||||
"recruitNumber": 1,
|
||||
"redirectUrl": "",
|
||||
"redirectable": False,
|
||||
"rootCompanyNumber": "CZ219609310",
|
||||
"rpoProxied": False,
|
||||
"rpoProxy": False,
|
||||
"salary60": "1.2-1.8万",
|
||||
"salaryCount": "",
|
||||
"salaryReal": "12001-18000",
|
||||
"salaryType": 1,
|
||||
"searchTagList": [],
|
||||
"securityAddressLabel": "",
|
||||
"settlementType": "",
|
||||
"showDistance": 0,
|
||||
"showSkillTags": [
|
||||
{
|
||||
"highlightBackGroundColor": "",
|
||||
"highlightWordColor": "",
|
||||
"tag": "3-5年"
|
||||
},
|
||||
{
|
||||
"highlightBackGroundColor": "",
|
||||
"highlightWordColor": "",
|
||||
"tag": "本科"
|
||||
},
|
||||
{
|
||||
"highlightBackGroundColor": "",
|
||||
"highlightWordColor": "",
|
||||
"tag": "医疗器械质量管理体系"
|
||||
},
|
||||
{
|
||||
"highlightBackGroundColor": "",
|
||||
"highlightWordColor": "",
|
||||
"tag": "ISO认证"
|
||||
},
|
||||
{
|
||||
"highlightBackGroundColor": "",
|
||||
"highlightWordColor": "",
|
||||
"tag": "GMP认证"
|
||||
},
|
||||
{
|
||||
"highlightBackGroundColor": "",
|
||||
"highlightWordColor": "",
|
||||
"tag": "FDA认证"
|
||||
},
|
||||
{
|
||||
"highlightBackGroundColor": "",
|
||||
"highlightWordColor": "",
|
||||
"tag": "二三类医械经验"
|
||||
},
|
||||
{
|
||||
"highlightBackGroundColor": "",
|
||||
"highlightWordColor": "",
|
||||
"tag": "医疗设备/器械"
|
||||
}
|
||||
],
|
||||
"skillLabel": [
|
||||
{
|
||||
"state": 0,
|
||||
"value": "医疗器械质量管理体系"
|
||||
},
|
||||
{
|
||||
"state": 0,
|
||||
"value": "ISO认证"
|
||||
},
|
||||
{
|
||||
"state": 0,
|
||||
"value": "GMP认证"
|
||||
},
|
||||
{
|
||||
"state": 0,
|
||||
"value": "FDA认证"
|
||||
},
|
||||
{
|
||||
"state": 0,
|
||||
"value": "二三类医械经验"
|
||||
}
|
||||
],
|
||||
"skillLabelPersonality": "",
|
||||
"staffCard": {
|
||||
"authenticationState": 0,
|
||||
"avatar": "https://storage-public.zhaopin.cn/zp/clouddisk/1771900452818918625/fee00116-221c-49df-a42a-6fb6f9cfcd09.png?x-oss-process=image/resize,l_240/rotate,0",
|
||||
"goldMedalInterviewer": {
|
||||
"goldMedalInterviewer": False,
|
||||
"interviewerImageUrl": "",
|
||||
"interviewerTitle": ""
|
||||
},
|
||||
"hrCompanyName": "",
|
||||
"hrJob": "HR",
|
||||
"hrOnlineIocState": 0,
|
||||
"hrOnlineState": "三日内活跃",
|
||||
"hrStateInfo": "高回复率",
|
||||
"id": 1168020727,
|
||||
"lastOnlineTime": 1774084085367,
|
||||
"lastOnlineTimeText": "",
|
||||
"staffName": "陈女士"
|
||||
},
|
||||
"streetId": 44030605,
|
||||
"streetName": "石岩",
|
||||
"subJobTypeLevel": "15000400170000",
|
||||
"subJobTypeLevelName": "质量体系工程师",
|
||||
"subways": [],
|
||||
"tagABC": "",
|
||||
"tagList": [],
|
||||
"todayInterview": False,
|
||||
"todayInterviewImageUrl": "",
|
||||
"topLabel": None,
|
||||
"tradingArea": "",
|
||||
"volcanoMeterial": None,
|
||||
"weeklyInternshipDays": 0,
|
||||
"welfareLabel": [],
|
||||
"welfareTagList": [],
|
||||
"workCity": "深圳",
|
||||
"workDateType": "",
|
||||
"workMode": "",
|
||||
"workType": "全职",
|
||||
"workTypeCode": "2",
|
||||
"workingExp": "3-5年"
|
||||
}
|
||||
}
|
||||
197
app/CLAUDE.md
Normal file
197
app/CLAUDE.md
Normal file
@ -0,0 +1,197 @@
|
||||
[根目录](../CLAUDE.md) > **app**
|
||||
|
||||
# app - FastAPI 后端模块
|
||||
|
||||
## 模块职责
|
||||
|
||||
提供 JobData 平台的 REST API 服务,包含:用户/角色/权限/菜单/部门管理(RBAC),招聘数据入库、查询、清洗与分析,Token 与代理 IP 管理,定时任务调度,以及审计日志记录。
|
||||
|
||||
---
|
||||
|
||||
## 入口与启动
|
||||
|
||||
| 文件 | 说明 |
|
||||
|------|------|
|
||||
| `run.py`(根目录) | `uvicorn` 启动入口,读取 `APP_HOST`/`APP_PORT`/`UVICORN_WORKERS` 环境变量 |
|
||||
| `app/__init__.py` | FastAPI 应用工厂 `create_app()`,注册中间件、异常处理器、路由,以及 lifespan 钩子 |
|
||||
| `app/core/init_app.py` | lifespan 内部逻辑:DB 迁移、种子数据、ClickHouse 初始化 |
|
||||
| `app/core/scheduler.py` | APScheduler 启动与任务注册 |
|
||||
|
||||
### 启动顺序
|
||||
|
||||
1. Tortoise-ORM 连接 MySQL,生成 schema
|
||||
2. 按环境变量执行数据库迁移(Aerich)
|
||||
3. 初始化种子数据(超级管理员、菜单、API、角色)
|
||||
4. 初始化 ClickHouse 表/视图(可选)
|
||||
5. APScheduler 启动定时任务
|
||||
6. FastAPI 开始接受请求
|
||||
|
||||
---
|
||||
|
||||
## 对外接口
|
||||
|
||||
API 前缀:`/api/v1`,完整路由注册见 `app/api/v1/__init__.py`。
|
||||
|
||||
| 路由前缀 | 标签 | 权限 | 说明 |
|
||||
|----------|------|------|------|
|
||||
| `/base` | 基础模块 | 无 | 登录、获取用户信息、菜单树 |
|
||||
| `/user` | 用户管理 | DependPermission | 用户 CRUD |
|
||||
| `/role` | 角色管理 | DependPermission | 角色 CRUD、菜单/API 分配 |
|
||||
| `/menu` | 菜单管理 | DependPermission | 菜单树 CRUD |
|
||||
| `/api` | API 管理 | DependPermission | 接口注册与权限管理 |
|
||||
| `/dept` | 部门管理 | DependPermission | 部门树 CRUD |
|
||||
| `/auditlog` | 审计日志 | DependPermission | 操作日志查询 |
|
||||
| `/job` & `/universal` | 数据入库/通用数据接口 | 无鉴权(内部调用) | 职位/公司数据批量入库 |
|
||||
| `/token` | Token 管理 | 无鉴权 | Boss Token CRUD |
|
||||
| `/proxy` | 代理 IP 管理 | DependPermission | 代理池管理 |
|
||||
| `/stats` | 数据统计 | 无 | 各平台数据量统计 |
|
||||
| `/pipeline` | 流水线 | 无 | 触发 ECS pipeline |
|
||||
| `/keyword` | 关键词管理 | 无 | 爬虫关键词(城市+职位)管理 |
|
||||
| `/cleaning` | 数据清理 | DependPermission | 定向清洗操作 |
|
||||
| `/analytics` | 数据分析 | 无 | 趋势、来源分布统计 |
|
||||
| `/company` | 公司搜索 | 无 | 公司信息查询 |
|
||||
|
||||
**认证机制**:JWT(HS256,有效期 7 天),通过 `DependPermission` 依赖注入检查路由级别权限。
|
||||
|
||||
---
|
||||
|
||||
## 关键依赖与配置
|
||||
|
||||
配置集中在 `app/settings/config.py`(`pydantic-settings.BaseSettings`,支持环境变量覆盖):
|
||||
|
||||
```python
|
||||
# 关键字段(需通过环境变量覆盖)
|
||||
SECRET_KEY = "CHANGE_ME_DEV_ONLY" # JWT 签名密钥
|
||||
TORTOISE_ORM.connections.default # MySQL 连接串(含密码)
|
||||
CLICKHOUSE_HOST / USER / PASS # ClickHouse 连接
|
||||
SMTP_USER / SMTP_PASS # 邮件凭据
|
||||
```
|
||||
|
||||
**中间件链**(从外到内):
|
||||
1. `CORSMiddleware` - 跨域(默认允许 `http://localhost:5173`)
|
||||
2. `BackGroundTaskMiddleware` - 后台任务支持
|
||||
3. `HttpAuditLogMiddleware` - HTTP 审计日志(排除登录接口)
|
||||
4. `IpTrackingMiddleware` - IP 请求追踪
|
||||
|
||||
---
|
||||
|
||||
## 数据模型
|
||||
|
||||
### MySQL(Tortoise-ORM)
|
||||
|
||||
| 表 | 模型文件 | 说明 |
|
||||
|----|----------|------|
|
||||
| `user` | `app/models/admin.py` | 用户(含角色多对多) |
|
||||
| `role` | `app/models/admin.py` | 角色(含菜单、API 多对多) |
|
||||
| `api` | `app/models/admin.py` | 接口注册表 |
|
||||
| `menu` | `app/models/admin.py` | 菜单树(parent_id 自引用) |
|
||||
| `dept` | `app/models/admin.py` | 部门树 + 闭包表 |
|
||||
| `auditlog` | `app/models/admin.py` | HTTP 操作审计 |
|
||||
| `boss_token` | `app/models/token.py` | Boss 直聘登录 Token |
|
||||
| `cleaning_*` | `app/models/cleaning.py` | 数据清洗任务状态 |
|
||||
| `scheduled_task_run` / `stats_total` | `app/models/metrics.py` | 定时任务运行记录与统计汇总 |
|
||||
|
||||
### ClickHouse(原始数据存储)
|
||||
|
||||
| 表/视图 | 引擎 | 说明 |
|
||||
|---------|------|------|
|
||||
| `boss_job` | MergeTree | Boss 职位原始 JSON,`job_id` 去重 |
|
||||
| `boss_company` | MergeTree | Boss 公司原始 JSON,`company_name` 去重 |
|
||||
| `qcwy_job` | MergeTree | 前程无忧职位,`job_id + update_date_time` 去重 |
|
||||
| `qcwy_company` | MergeTree | 前程无忧公司 |
|
||||
| `zhilian_job` | MergeTree | 智联招聘职位,`number + first_publish_time` 去重 |
|
||||
| `zhilian_company` | MergeTree | 智联招聘公司 |
|
||||
| `pending_company` | ReplacingMergeTree | 待处理公司队列,`(source, company_id)` 去重 |
|
||||
| `job_analytics` | VIEW | 三平台统一分析视图(UNION ALL) |
|
||||
|
||||
ClickHouse 表结构在 `app/core/clickhouse_init.py` 中通过 `CREATE TABLE IF NOT EXISTS` 管理。
|
||||
|
||||
---
|
||||
|
||||
## 核心服务
|
||||
|
||||
| 服务文件 | 职责 |
|
||||
|----------|------|
|
||||
| `app/services/cleaning.py` | `CleaningService`:多平台定向清洗(URL/ID/公司名/公司ID),自动识别平台 |
|
||||
| `app/services/company_cleaner.py` | 公司数据自动清洗:collect 待处理 → process → 入库 |
|
||||
| `app/services/analytics_service.py` | `AnalyticsService`:封装 ClickHouse 分析查询 |
|
||||
| `app/services/job.py` | `DataRouterService`:数据路由入库(去重逻辑) |
|
||||
| `app/services/ingest_service.py` | 批量数据摄入 |
|
||||
| `app/services/crawler/boss.py` | Boss 爬虫 Service 封装(HTTP 层) |
|
||||
| `app/services/crawler/qcwy.py` | 前程无忧爬虫 Service |
|
||||
| `app/services/crawler/zhilian.py` | 智联招聘爬虫 Service |
|
||||
| `app/repositories/clickhouse_repo.py` | ClickHouse Repository(`ClickHouseBaseRepo` + `JobAnalyticsRepo`) |
|
||||
| `app/core/scheduler.py` | 定时任务:stats、ip_alert、ecs_pipeline、company_cleaning、daily_cleanup |
|
||||
| `app/core/locks.py` | `DistributedLock`:基于文件/Redis 的分布式锁,防多 Worker 重复执行 |
|
||||
| `app/core/algorithms/antispider.py` | 反爬虫算法(签名生成等) |
|
||||
|
||||
---
|
||||
|
||||
## 测试与质量
|
||||
|
||||
- 当前无测试文件,属于主要缺口。
|
||||
- 代码质量工具:`ruff`(lint)、`black`(格式)、`isort`(导入排序)。
|
||||
- 建议优先补充的测试:
|
||||
- `CleaningService.clean_target_auto()` 的平台识别逻辑
|
||||
- `DataRouterService.store_data()` 的去重逻辑
|
||||
- `app/api/v1/analytics.py` 接口集成测试
|
||||
|
||||
---
|
||||
|
||||
## 常见问题 (FAQ)
|
||||
|
||||
**Q: 启动报 ClickHouse 连接失败?**
|
||||
A: 检查 `CLICKHOUSE_HOST` 环境变量,或在 `config.py` 中将 `CLICKHOUSE_HOST` 置为空字符串跳过初始化。
|
||||
|
||||
**Q: 多 Worker 下任务重复执行?**
|
||||
A: 任务通过文件锁(`.startup_lock` 目录)和 `DistributedLock` 防止多个 Worker 重复执行;若 Worker 异常退出,可能导致锁残留,此时手动删除 `.startup_lock` 目录即可。
|
||||
|
||||
**Q: 新增 API 接口后权限不生效?**
|
||||
A: 在路由文件中注册路由后需重启应用:启动时会触发 `api_controller.refresh_api()`,自动扫描 FastAPI 路由表并更新 `api` 表;随后还需在角色管理中为相应角色分配该接口的权限,权限才会生效。
|
||||
|
||||
---
|
||||
|
||||
## 相关文件清单
|
||||
|
||||
```
|
||||
app/
|
||||
├── __init__.py # 应用工厂 create_app()
|
||||
├── settings/config.py # 全局配置(Settings)
|
||||
├── api/v1/__init__.py # 路由聚合
|
||||
├── api/v1/analytics.py # 数据分析接口
|
||||
├── api/v1/cleaning/ # 数据清理接口
|
||||
├── api/v1/job/ # 数据入库接口
|
||||
├── api/v1/keyword/ # 关键词管理接口
|
||||
├── api/v1/company/ # 公司搜索接口
|
||||
├── controllers/ # 业务控制器(CRUD 封装)
|
||||
├── core/
|
||||
│ ├── init_app.py # lifespan 初始化
|
||||
│ ├── scheduler.py # APScheduler 定时任务
|
||||
│ ├── clickhouse.py # ClickHouse 连接管理
|
||||
│ ├── clickhouse_init.py # ClickHouse 表/视图 DDL
|
||||
│ ├── locks.py # 分布式锁
|
||||
│ ├── middlewares.py # 中间件
|
||||
│ └── algorithms/ # 签名/反爬虫算法
|
||||
├── models/
|
||||
│ ├── admin.py # User, Role, Api, Menu, Dept, AuditLog
|
||||
│ ├── token.py # BossToken
|
||||
│ ├── metrics.py # ScheduledTaskRun, StatsTotal
|
||||
│ └── cleaning.py # 清洗任务状态
|
||||
├── repositories/
|
||||
│ └── clickhouse_repo.py # ClickHouse 查询仓库
|
||||
├── services/
|
||||
│ ├── cleaning.py # CleaningService
|
||||
│ ├── company_cleaner.py # 公司自动清洗
|
||||
│ ├── analytics_service.py # 数据分析 Service
|
||||
│ ├── job.py # DataRouterService(数据入库路由)
|
||||
│ └── crawler/ # 各平台爬虫 Service 封装
|
||||
└── schemas/ # Pydantic 请求/响应 Schema
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 变更记录 (Changelog)
|
||||
|
||||
| 日期 | 说明 |
|
||||
|------|------|
|
||||
| 2026-03-20 | 初始化模块文档 |
|
||||
@ -14,6 +14,7 @@ from app.core.init_app import (
|
||||
)
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from app.core.scheduler import start_scheduler, shutdown_scheduler
|
||||
from app.services.ingest.remote_push import close_http_client
|
||||
|
||||
try:
|
||||
from app.settings.config import settings
|
||||
@ -28,7 +29,8 @@ async def lifespan(app: FastAPI):
|
||||
await init_data()
|
||||
start_scheduler()
|
||||
yield
|
||||
# 清理所有数据库连接
|
||||
# 清理所有连接
|
||||
await close_http_client()
|
||||
await Tortoise.close_connections()
|
||||
await clickhouse_manager.close()
|
||||
shutdown_scheduler()
|
||||
|
||||
@ -29,8 +29,9 @@ v1_router.include_router(menus_router, prefix="/menu", dependencies=[DependPermi
|
||||
v1_router.include_router(apis_router, prefix="/api", dependencies=[DependPermission])
|
||||
v1_router.include_router(depts_router, prefix="/dept", dependencies=[DependPermission])
|
||||
v1_router.include_router(auditlog_router, prefix="/auditlog", dependencies=[DependPermission])
|
||||
v1_router.include_router(job_router, prefix="/ingest", tags=["数据入库"])
|
||||
v1_router.include_router(job_router, prefix="/job", tags=["数据入库"])
|
||||
v1_router.include_router(job_router, prefix="/universal", tags=["通用数据接口"])
|
||||
v1_router.include_router(job_router, prefix="/universal", tags=["数据入库"])
|
||||
v1_router.include_router(token_router, prefix="/token", tags=["Token管理"])
|
||||
v1_router.include_router(proxy_router, prefix="/proxy", tags=["代理IP管理"])
|
||||
v1_router.include_router(stats_router, prefix="/stats")
|
||||
|
||||
@ -1,9 +1,6 @@
|
||||
from typing import Optional, List
|
||||
from typing import Optional
|
||||
from datetime import datetime, date, timezone
|
||||
try:
|
||||
from zoneinfo import ZoneInfo
|
||||
except ImportError:
|
||||
from backports.zoneinfo import ZoneInfo
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
@ -30,24 +27,28 @@ async def get_overview(
|
||||
from_date: Optional[date] = None,
|
||||
to_date: Optional[date] = None,
|
||||
city: Optional[str] = None,
|
||||
channel: Optional[str] = None,
|
||||
service: AnalyticsService = Depends(get_analytics_service)
|
||||
):
|
||||
from_dt = to_utc(datetime.combine(from_date, datetime.min.time())) if from_date else None
|
||||
to_dt = to_utc(datetime.combine(to_date, datetime.max.time())) if to_date else None
|
||||
|
||||
|
||||
filters = {}
|
||||
if city:
|
||||
filters["city"] = city
|
||||
|
||||
if channel:
|
||||
filters["channel"] = channel
|
||||
|
||||
return await service.get_job_statistics(filters=filters, from_dt=from_dt, to_dt=to_dt)
|
||||
|
||||
@router.get("/trend/volume", summary="获取数据量趋势")
|
||||
async def get_volume_trend(
|
||||
interval: str = Query("day", regex="^(day|hour|week|month)$"),
|
||||
interval: str = Query("day", pattern="^(day|hour|week|month)$"),
|
||||
from_date: Optional[date] = None,
|
||||
to_date: Optional[date] = None,
|
||||
from_datetime: Optional[datetime] = None,
|
||||
to_datetime: Optional[datetime] = None,
|
||||
channel: Optional[str] = None,
|
||||
service: AnalyticsService = Depends(get_analytics_service)
|
||||
):
|
||||
# 兼容小时粒度的精确时间窗口,其它粒度按日期转换为起止时间
|
||||
@ -65,7 +66,10 @@ async def get_volume_trend(
|
||||
else:
|
||||
to_dt = None
|
||||
|
||||
return await service.get_volume_trend(interval=interval, from_dt=from_dt, to_dt=to_dt)
|
||||
filters = {}
|
||||
if channel:
|
||||
filters["channel"] = channel
|
||||
return await service.get_volume_trend(interval=interval, filters=filters, from_dt=from_dt, to_dt=to_dt)
|
||||
|
||||
@router.get("/distribution/source", summary="获取数据来源分布")
|
||||
async def get_source_distribution(
|
||||
@ -73,6 +77,7 @@ async def get_source_distribution(
|
||||
to_date: Optional[date] = None,
|
||||
from_datetime: Optional[datetime] = None,
|
||||
to_datetime: Optional[datetime] = None,
|
||||
channel: Optional[str] = None,
|
||||
service: AnalyticsService = Depends(get_analytics_service)
|
||||
):
|
||||
if from_datetime:
|
||||
@ -89,4 +94,7 @@ async def get_source_distribution(
|
||||
else:
|
||||
to_dt = None
|
||||
|
||||
return await service.get_source_distribution(from_dt=from_dt, to_dt=to_dt)
|
||||
filters = {}
|
||||
if channel:
|
||||
filters["channel"] = channel
|
||||
return await service.get_source_distribution(filters=filters, from_dt=from_dt, to_dt=to_dt)
|
||||
|
||||
@ -1,59 +1,91 @@
|
||||
from datetime import datetime
|
||||
import json
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, File, UploadFile, Form, Body, Query
|
||||
from tortoise.expressions import Q
|
||||
from tortoise.functions import Count, Sum
|
||||
|
||||
from app.models.cleaning import CleaningTask
|
||||
from app.models.company import CompanyCleaningQueue
|
||||
from app.schemas import Success, SuccessExtra
|
||||
from app.services.cleaning import CleaningService
|
||||
from app.services.company_cleaner import company_cleaner
|
||||
from app.services.company_storage import company_storage, normalize_company_id
|
||||
from app.controllers.cleaning import cleaning_controller
|
||||
from app.schemas import Success, SuccessExtra
|
||||
from app.models.cleaning import CleaningTask
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from tortoise.expressions import Q
|
||||
from typing import Optional
|
||||
import json
|
||||
|
||||
router = APIRouter()
|
||||
cleaning_service = CleaningService()
|
||||
COMPANY_SOURCES = ("boss", "qcwy", "zhilian")
|
||||
|
||||
@router.get("/stats", summary="获取公司清洗统计信息")
|
||||
async def get_stats():
|
||||
"""获取 ClickHouse 中待处理公司的统计信息"""
|
||||
client = await clickhouse_manager.get_client()
|
||||
|
||||
pending_sql = "SELECT count() FROM job_data.pending_company FINAL WHERE status = 'pending'"
|
||||
pending_res = await client.query(pending_sql)
|
||||
pending_count = pending_res.result_rows[0][0] if pending_res.result_rows else 0
|
||||
|
||||
today_sql = "SELECT count() FROM job_data.pending_company FINAL WHERE status = 'done' AND toDate(updated_at) = today()"
|
||||
today_res = await client.query(today_sql)
|
||||
today_count = today_res.result_rows[0][0] if today_res.result_rows else 0
|
||||
|
||||
dist_sql = """
|
||||
SELECT source, status, count()
|
||||
FROM job_data.pending_company FINAL
|
||||
GROUP BY source, status
|
||||
ORDER BY source, status
|
||||
"""
|
||||
dist_res = await client.query(dist_sql)
|
||||
|
||||
"""获取 MySQL 中待处理公司的统计信息"""
|
||||
pending_count = await CompanyCleaningQueue.filter(status="pending").count()
|
||||
today_count = await CompanyCleaningQueue.filter(
|
||||
status="done",
|
||||
updated_at__gte=datetime.now().replace(hour=0, minute=0, second=0, microsecond=0),
|
||||
).count()
|
||||
dist_rows = await CompanyCleaningQueue.annotate(total=Count("id")).group_by("source", "status").values(
|
||||
"source",
|
||||
"status",
|
||||
"total",
|
||||
)
|
||||
|
||||
stats = {
|
||||
"total_pending": pending_count,
|
||||
"today_processed": today_count,
|
||||
"details": []
|
||||
}
|
||||
|
||||
# Process distribution
|
||||
source_stats = {}
|
||||
for row in dist_res.result_rows:
|
||||
source, status, count = row
|
||||
|
||||
source_stats = {
|
||||
source: {
|
||||
"pending": 0,
|
||||
"processing": 0,
|
||||
"done": 0,
|
||||
"failed": 0,
|
||||
"total": 0,
|
||||
"jobs_fetched": 0,
|
||||
"jobs_stored": 0,
|
||||
"jobs_duplicate": 0,
|
||||
"jobs_failed": 0,
|
||||
}
|
||||
for source in COMPANY_SOURCES
|
||||
}
|
||||
for row in dist_rows:
|
||||
source = row["source"]
|
||||
status = row["status"]
|
||||
count = row["total"]
|
||||
if source not in source_stats:
|
||||
source_stats[source] = {"pending": 0, "done": 0, "failed": 0, "total": 0}
|
||||
|
||||
continue
|
||||
|
||||
if status in source_stats[source]:
|
||||
source_stats[source][status] = count
|
||||
source_stats[source]["total"] += count
|
||||
|
||||
stats["details"] = [
|
||||
{"source": k, **v} for k, v in source_stats.items()
|
||||
]
|
||||
|
||||
|
||||
job_dist_rows = await CompanyCleaningQueue.annotate(
|
||||
jobs_fetched_sum=Sum("jobs_fetched"),
|
||||
jobs_stored_sum=Sum("jobs_stored"),
|
||||
jobs_duplicate_sum=Sum("jobs_duplicate"),
|
||||
jobs_failed_sum=Sum("jobs_failed"),
|
||||
).group_by("source").values(
|
||||
"source",
|
||||
"jobs_fetched_sum",
|
||||
"jobs_stored_sum",
|
||||
"jobs_duplicate_sum",
|
||||
"jobs_failed_sum",
|
||||
)
|
||||
for row in job_dist_rows:
|
||||
source = row["source"]
|
||||
if source not in source_stats:
|
||||
continue
|
||||
source_stats[source]["jobs_fetched"] = int(row["jobs_fetched_sum"] or 0)
|
||||
source_stats[source]["jobs_stored"] = int(row["jobs_stored_sum"] or 0)
|
||||
source_stats[source]["jobs_duplicate"] = int(row["jobs_duplicate_sum"] or 0)
|
||||
source_stats[source]["jobs_failed"] = int(row["jobs_failed_sum"] or 0)
|
||||
|
||||
stats["details"] = [{"source": k, **v} for k, v in source_stats.items()]
|
||||
|
||||
return Success(data=stats)
|
||||
|
||||
|
||||
@ -65,44 +97,44 @@ async def get_companies_list(
|
||||
status: Optional[str] = Query(None)
|
||||
):
|
||||
"""分页获取待处理公司列表详情"""
|
||||
client = await clickhouse_manager.get_client()
|
||||
offset = (page - 1) * page_size
|
||||
|
||||
where_clauses = []
|
||||
|
||||
VALID_SOURCES = {"boss", "qcwy", "zhilian"}
|
||||
VALID_STATUSES = {"pending", "processing", "done", "failed"}
|
||||
|
||||
queue_query = CompanyCleaningQueue.all()
|
||||
if source:
|
||||
where_clauses.append(f"source = '{source}'")
|
||||
if source not in VALID_SOURCES:
|
||||
return SuccessExtra(data=[], total=0, page=page, page_size=page_size)
|
||||
queue_query = queue_query.filter(source=source)
|
||||
if status:
|
||||
where_clauses.append(f"status = '{status}'")
|
||||
|
||||
where_sql = " WHERE " + " AND ".join(where_clauses) if where_clauses else ""
|
||||
|
||||
# Count
|
||||
count_sql = f"SELECT count() FROM job_data.pending_company FINAL {where_sql}"
|
||||
count_res = await client.query(count_sql)
|
||||
total = count_res.result_rows[0][0] if count_res.result_rows else 0
|
||||
|
||||
# Data
|
||||
sql = f"""
|
||||
SELECT source, company_id, company_name, status, error_msg, created_at, updated_at
|
||||
FROM job_data.pending_company FINAL
|
||||
{where_sql}
|
||||
ORDER BY updated_at DESC
|
||||
LIMIT {page_size} OFFSET {offset}
|
||||
"""
|
||||
res = await client.query(sql)
|
||||
|
||||
if status not in VALID_STATUSES:
|
||||
return SuccessExtra(data=[], total=0, page=page, page_size=page_size)
|
||||
queue_query = queue_query.filter(status=status)
|
||||
|
||||
total = await queue_query.count()
|
||||
rows = await queue_query.order_by("-updated_at").offset(offset).limit(page_size)
|
||||
|
||||
data = []
|
||||
for row in res.result_rows:
|
||||
for row in rows:
|
||||
data.append({
|
||||
"source": row[0],
|
||||
"company_id": row[1],
|
||||
"company_name": row[2],
|
||||
"status": row[3],
|
||||
"error_msg": row[4],
|
||||
"created_at": row[5].isoformat() if row[5] else None,
|
||||
"updated_at": row[6].isoformat() if row[6] else None
|
||||
"source": row.source,
|
||||
"company_id": row.company_id,
|
||||
"company_name": row.company_name,
|
||||
"status": row.status,
|
||||
"error_msg": row.error_msg,
|
||||
"retry_count": row.retry_count,
|
||||
"started_at": row.started_at.isoformat() if row.started_at else None,
|
||||
"finished_at": row.finished_at.isoformat() if row.finished_at else None,
|
||||
"jobs_fetched": row.jobs_fetched,
|
||||
"jobs_stored": row.jobs_stored,
|
||||
"jobs_duplicate": row.jobs_duplicate,
|
||||
"jobs_failed": row.jobs_failed,
|
||||
"jobs_error_msg": row.jobs_error_msg,
|
||||
"created_at": row.created_at.isoformat() if row.created_at else None,
|
||||
"updated_at": row.updated_at.isoformat() if row.updated_at else None
|
||||
})
|
||||
|
||||
|
||||
return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
|
||||
|
||||
|
||||
@ -112,61 +144,30 @@ async def get_company_cleaning_detail(
|
||||
company_id: str = Query(..., description="公司ID"),
|
||||
company_name: Optional[str] = Query(None, description="公司名称"),
|
||||
):
|
||||
client = await clickhouse_manager.get_client()
|
||||
table_map = {
|
||||
"boss": "boss_company",
|
||||
"qcwy": "qcwy_company",
|
||||
"zhilian": "zhilian_company",
|
||||
}
|
||||
table = table_map.get(source)
|
||||
if not table:
|
||||
if source not in {"boss", "qcwy", "zhilian"}:
|
||||
return Success(code=400, msg="不支持的数据源")
|
||||
|
||||
if source == "qcwy":
|
||||
sql = f"""
|
||||
SELECT json_data, company_name, created_at, updated_at
|
||||
FROM job_data.{table}
|
||||
WHERE JSONExtractString(json_data, 'companyId') = {{company_id:String}}
|
||||
OR JSONExtractString(json_data, 'coId') = {{company_id:String}}
|
||||
OR JSONExtractString(json_data, 'coinfo', 'coid') = {{company_id:String}}
|
||||
OR company_name = {{company_name:String}}
|
||||
ORDER BY updated_at DESC
|
||||
LIMIT 1
|
||||
"""
|
||||
params = {
|
||||
"company_id": str(company_id),
|
||||
"company_name": str(company_name or ""),
|
||||
}
|
||||
else:
|
||||
if not company_name:
|
||||
return Success(code=400, msg="缺少公司名称")
|
||||
sql = f"""
|
||||
SELECT json_data, company_name, created_at, updated_at
|
||||
FROM job_data.{table}
|
||||
WHERE company_name = {{company_name:String}}
|
||||
ORDER BY updated_at DESC
|
||||
LIMIT 1
|
||||
"""
|
||||
params = {"company_name": str(company_name)}
|
||||
|
||||
print(f"DEBUG: Executing SQL: {sql}")
|
||||
print(f"DEBUG: Params: {params}")
|
||||
res = await client.query(sql, parameters=params)
|
||||
if not res.result_rows:
|
||||
model = company_storage.company_model(source)
|
||||
normalized_id = normalize_company_id(source, company_id)
|
||||
row = await model.get_or_none(source_company_id=normalized_id)
|
||||
if not row and company_name:
|
||||
row = await model.get_or_none(company_name=company_name)
|
||||
if not row:
|
||||
return Success(code=404, msg="未找到公司清洗结果")
|
||||
row = res.result_rows[0]
|
||||
raw_json = row[0]
|
||||
try:
|
||||
data = json.loads(raw_json)
|
||||
except Exception:
|
||||
data = {"raw": raw_json}
|
||||
|
||||
data = row.raw_json
|
||||
if isinstance(data, str):
|
||||
try:
|
||||
data = json.loads(data)
|
||||
except Exception:
|
||||
data = {"raw": data}
|
||||
return Success(
|
||||
data={
|
||||
"source": source,
|
||||
"company_id": company_id,
|
||||
"company_name": row[1],
|
||||
"created_at": row[2].isoformat() if row[2] else None,
|
||||
"updated_at": row[3].isoformat() if row[3] else None,
|
||||
"company_id": row.source_company_id,
|
||||
"company_name": row.company_name,
|
||||
"created_at": row.created_at.isoformat() if row.created_at else None,
|
||||
"updated_at": row.updated_at.isoformat() if row.updated_at else None,
|
||||
"data": data,
|
||||
}
|
||||
)
|
||||
@ -178,10 +179,13 @@ async def collect_pending_companies_api(
|
||||
source: Optional[str] = Body(None, embed=True)
|
||||
):
|
||||
"""
|
||||
分析招聘数据,收集待处理的公司ID到 pending_company 表
|
||||
分析招聘数据,收集待处理的公司ID到 MySQL 队列表
|
||||
"""
|
||||
await company_cleaner.collect_pending_companies(limit=limit, source=source)
|
||||
return Success(msg=f"已完成数据分析,已收集待处理公司(上限 {limit} 条)")
|
||||
summary = await company_cleaner.collect_pending_companies(limit=limit, source=source)
|
||||
return Success(
|
||||
msg=f"已完成数据分析,本次新增 {summary['total_created']} 条待处理公司",
|
||||
data=summary,
|
||||
)
|
||||
|
||||
|
||||
@router.post("/run-pending", summary="手动执行待处理公司清洗")
|
||||
@ -212,7 +216,7 @@ async def crawl_execute_pending(
|
||||
proxy: Optional[str] = Body(None, embed=True),
|
||||
max_delay_seconds: int = Body(5, embed=True),
|
||||
):
|
||||
await company_cleaner.collect_pending_companies(source=source)
|
||||
await company_cleaner.collect_pending_companies(limit=limit, source=source)
|
||||
await company_cleaner.process_pending_companies(
|
||||
limit=limit,
|
||||
source=source,
|
||||
@ -222,6 +226,40 @@ async def crawl_execute_pending(
|
||||
return Success(msg=f"已触发爬取并执行最近 {limit} 条待处理公司清洗任务")
|
||||
|
||||
|
||||
@router.post("/update-company-status", summary="更新公司爬取状态(爬虫端调用)")
async def update_company_status(
    source: str = Body(..., embed=True),
    company_id: str = Body(..., embed=True),
    status: str = Body(..., embed=True),
    error_message: str = Body("", embed=True),
):
    """Update the MySQL cleaning-queue row for one company after a crawl attempt.

    Called by the crawler once it has finished fetching a company's data.

    Args:
        source: Platform identifier; must be one of ``boss``, ``qcwy``, ``zhilian``.
        company_id: Platform-side company ID. It is passed through
            ``normalize_company_id`` before the queue row is looked up.
        status: Final crawl state; only ``done`` and ``failed`` are accepted.
        error_message: Optional failure description stored on the queue row.

    Returns:
        A ``Success`` with ``code=400`` when ``source`` or ``status`` is not
        supported; otherwise a ``Success`` echoing ``source``, the
        caller-supplied ``company_id`` and ``status``.
    """
    VALID_SOURCES = {"boss", "qcwy", "zhilian"}
    VALID_STATUSES = {"done", "failed"}

    # Reject unknown platforms up front: get_or_create below would otherwise
    # insert a queue row for an arbitrary source string.
    if source not in VALID_SOURCES:
        return Success(code=400, msg="不支持的数据源")
    if status not in VALID_STATUSES:
        return Success(msg=f"无效状态: {status},仅支持 {VALID_STATUSES}", code=400)

    normalized_id = normalize_company_id(source, company_id)
    # The row is created on demand so a status report for a company that was
    # never queued is still recorded.
    queue, _ = await CompanyCleaningQueue.get_or_create(
        source=source,
        company_id=normalized_id,
        defaults={
            "company_name": "",
            "status": "pending",
            "error_msg": "",
            "retry_count": 0,
            "started_at": None,
            "finished_at": None,
        },
    )
    queue.status = status
    queue.error_msg = error_message or ""
    queue.finished_at = datetime.now()
    if status == "failed":
        # Only failed attempts count towards the retry counter.
        queue.retry_count += 1
    await queue.save()
    return Success(msg="状态更新成功", data={"source": source, "company_id": company_id, "status": status})
|
||||
|
||||
|
||||
@router.post("/process-company", summary="执行单个公司清洗任务")
|
||||
async def process_single_company_api(
|
||||
source: str = Body(..., embed=True),
|
||||
|
||||
@ -1,46 +0,0 @@
|
||||
from typing import Optional, List, Dict, Any
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from pydantic import BaseModel, Field
|
||||
from clickhouse_connect.driver import AsyncClient
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from app.services.ingest_service import IngestService
|
||||
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class IngestSingleRequest(BaseModel):
|
||||
platform: str = Field(...)
|
||||
data_type: str = Field(...)
|
||||
data: Dict[str, Any] = Field(...)
|
||||
check_duplicate: bool = Field(True)
|
||||
|
||||
|
||||
class IngestBatchRequest(BaseModel):
|
||||
platform: str = Field(...)
|
||||
data_type: str = Field(...)
|
||||
data_list: List[Dict[str, Any]] = Field(...)
|
||||
check_duplicate: bool = Field(True)
|
||||
|
||||
|
||||
async def get_service() -> IngestService:
|
||||
client: AsyncClient = await clickhouse_manager.get_client()
|
||||
return IngestService(client)
|
||||
|
||||
|
||||
@router.post("/data")
|
||||
async def ingest_data(req: IngestSingleRequest, service: IngestService = Depends(get_service)):
|
||||
try:
|
||||
res = await service.store_single(req.platform, req.data_type, req.data, req.check_duplicate)
|
||||
return {"code": 200, "data": res, "message": "ok"}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
|
||||
@router.post("/batch")
|
||||
async def ingest_batch(req: IngestBatchRequest, service: IngestService = Depends(get_service)):
|
||||
try:
|
||||
res = await service.store_batch(req.platform, req.data_type, req.data_list, req.check_duplicate)
|
||||
return {"code": 200, "data": res, "message": "ok"}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
@ -3,6 +3,6 @@ from fastapi import APIRouter
|
||||
from .job import router
|
||||
|
||||
job_router = APIRouter()
|
||||
job_router.include_router(router, tags=["数据上报"])
|
||||
job_router.include_router(router, tags=["数据入库"])
|
||||
|
||||
__all__ = ["job_router"]
|
||||
|
||||
@ -5,7 +5,12 @@ from pydantic import BaseModel, Field
|
||||
|
||||
from app.controllers.keyword import KeywordController
|
||||
from app.core.dependency import DependPermission
|
||||
from app.schemas.keyword import KeywordCreate, KeywordUpdate
|
||||
from app.schemas.keyword import (
|
||||
CrawlCompleteRequest,
|
||||
KeywordCreate,
|
||||
KeywordUpdate,
|
||||
PageProgressRequest,
|
||||
)
|
||||
|
||||
router = APIRouter(tags=["关键词接口"])
|
||||
|
||||
@ -34,18 +39,14 @@ async def get_available(
|
||||
source: str,
|
||||
limit: int = 1,
|
||||
reserve: bool = True,
|
||||
crawler_id: str = "",
|
||||
controller: KeywordController = Depends(get_keyword_controller),
|
||||
) -> Dict[str, Any]:
|
||||
"""根据平台获取当天未使用的检索条件
|
||||
|
||||
参数:
|
||||
source: 平台标识,boss|qcwy|zhilian
|
||||
limit: 返回数量上限,默认1
|
||||
|
||||
返回:
|
||||
标准字典结构,包含 items/total/limit
|
||||
优先级: partial(断点续爬) > failed(重试) > 全新关键词
|
||||
"""
|
||||
return await controller.get_available(source, limit, reserve)
|
||||
return await controller.get_available(source, limit, reserve, crawler_id)
|
||||
|
||||
|
||||
@router.post("/mark-used", summary="将检索条件标记为今日已使用")
|
||||
@ -180,3 +181,32 @@ async def delete_keyword(
|
||||
删除结果
|
||||
"""
|
||||
return await controller.delete_keyword(source, id)
|
||||
|
||||
|
||||
@router.post("/page-progress", summary="爬虫汇报单页爬取进度")
async def report_page_progress(
    request: PageProgressRequest,
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Forward a crawler's per-page progress report to the keyword controller.

    The crawler calls this endpoint each time it finishes a page.

    Args:
        request: Progress payload; its ``source``, ``keyword_id``, ``page``,
            ``total_pages`` and ``jobs_found`` fields are passed on positionally.
        controller: Keyword controller supplied by ``get_keyword_controller``.

    Returns:
        Whatever ``controller.report_page_progress`` returns.
    """
    progress_fields = (
        request.source,
        request.keyword_id,
        request.page,
        request.total_pages,
        request.jobs_found,
    )
    outcome = await controller.report_page_progress(*progress_fields)
    return outcome
|
||||
|
||||
|
||||
@router.post("/crawl-complete", summary="爬虫汇报爬取完成或失败")
async def report_crawl_complete(
    request: CrawlCompleteRequest,
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Forward a crawler's final completed-or-failed report to the keyword controller.

    The crawler calls this endpoint once a crawl has finished or failed.

    Args:
        request: Completion payload; its ``source``, ``keyword_id``, ``status``
            and ``error_message`` fields are passed on positionally.
        controller: Keyword controller supplied by ``get_keyword_controller``.

    Returns:
        Whatever ``controller.report_crawl_complete`` returns.
    """
    completion_fields = (
        request.source,
        request.keyword_id,
        request.status,
        request.error_message,
    )
    outcome = await controller.report_crawl_complete(*completion_fields)
    return outcome
|
||||
|
||||
@ -1,13 +1,12 @@
|
||||
import logging
|
||||
import time
|
||||
from typing import Any, Dict, Tuple
|
||||
from fastapi import APIRouter, Query, Body, Path, BackgroundTasks
|
||||
from fastapi.background import P
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
from fastapi import APIRouter, Query, Body, Path
|
||||
from tortoise.expressions import Q
|
||||
|
||||
from app.controllers.token import token_controller
|
||||
from app.schemas.base import Fail, Success, SuccessExtra
|
||||
from app.schemas.token import BossTokenUpdate,BossTokenCreate
|
||||
from app.schemas.base import Success, SuccessExtra
|
||||
from app.schemas.token import BossTokenUpdate, BossTokenCreate
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@ -15,7 +14,7 @@ token_router = APIRouter()
|
||||
|
||||
# 简单内存缓存:key 为查询参数组合,value 为 (缓存时间戳, 响应数据)
|
||||
_TOKENS_CACHE: Dict[Tuple[Any, Any, int, int], Tuple[float, Dict[str, Any]]] = {}
|
||||
_CACHE_TTL_SECONDS: int =60
|
||||
_CACHE_TTL_SECONDS: int = 60
|
||||
|
||||
|
||||
@token_router.get("/tokens", summary="获取Boss Token列表")
async def list_boss_tokens(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
    status: int = Query(None, description="状态筛选"),
    wt2: Optional[str] = Query(None),
    mpt: Optional[str] = Query(None),
):
    """Return one page of Boss tokens, served from an in-memory cache when fresh.

    Args:
        page: Page number.
        page_size: Number of items per page.
        status: Optional exact-match filter on the token status.
        wt2: Optional case-insensitive substring filter on ``wt2``.
        mpt: Optional case-insensitive substring filter on ``mpt``.

    Returns:
        A ``SuccessExtra`` response carrying the token dicts together with
        ``total``, ``page`` and ``page_size``. A response is reused for up to
        ``_CACHE_TTL_SECONDS`` seconds for an identical combination of
        arguments.
    """
    # Every filter has to be part of the key: without `status`, a response
    # cached for one status value would be returned for a different one.
    # NOTE(review): the module-level _TOKENS_CACHE annotation still declares a
    # 4-tuple key; widen it to match when that line is next touched.
    cache_key = (status, wt2, mpt, page, page_size)
    now = time.monotonic()
    cached = _TOKENS_CACHE.get(cache_key)
    if cached and (now - cached[0] < _CACHE_TTL_SECONDS):
        return cached[1]

    q = Q()
    if status is not None:
        q &= Q(status=status)
    if wt2:
        q &= Q(wt2__icontains=wt2)
    if mpt:
        q &= Q(mpt__icontains=mpt)

    total, token_objs = await token_controller.get_tokens(page=page, page_size=page_size, search=q)
    data = [await obj.to_dict() for obj in token_objs]

    resp = SuccessExtra(data=data, total=total, page=page, page_size=page_size)
    _TOKENS_CACHE[cache_key] = (now, resp)
    return resp
|
||||
|
||||
|
||||
@token_router.get("/tokens/{token_id}", summary="获取Boss Token详情")
|
||||
@ -53,7 +64,6 @@ async def create_boss_token(
|
||||
):
|
||||
"""创建Boss Token"""
|
||||
await token_controller.create_token(token_data)
|
||||
# 清空缓存,确保新数据立即生效
|
||||
_TOKENS_CACHE.clear()
|
||||
return Success(msg="创建成功")
|
||||
|
||||
@ -65,7 +75,6 @@ async def update_boss_token(
|
||||
):
|
||||
"""更新Boss Token"""
|
||||
await token_controller.update_token(token_id, token_data)
|
||||
# 清空缓存,确保更新立即生效
|
||||
_TOKENS_CACHE.clear()
|
||||
return Success(msg="更新成功")
|
||||
|
||||
@ -76,7 +85,6 @@ async def delete_boss_token(
|
||||
):
|
||||
"""删除Boss Token"""
|
||||
await token_controller.delete_token(token_id)
|
||||
# 清空缓存,确保删除立即生效
|
||||
_TOKENS_CACHE.clear()
|
||||
return Success(msg="删除成功")
|
||||
|
||||
@ -84,104 +92,7 @@ async def delete_boss_token(
|
||||
@token_router.post("/tokens/cache/clear", summary="强制清除Token缓存")
async def clear_token_cache():
    """Drop every entry from the in-memory token-list cache.

    Returns:
        A ``Success`` response whose message reports how many cache entries
        were removed.
    """
    # Measure first so both the log line and the response can report the count.
    evicted = len(_TOKENS_CACHE)
    # The dict is emptied in place, so the module-level name is never rebound.
    _TOKENS_CACHE.clear()
    logger.info(f"手动清除Token缓存,清除了 {evicted} 条缓存数据")
    return Success(msg=f"成功清除 {evicted} 条Token缓存")
|
||||
from typing import Optional, Dict, Any
|
||||
from fastapi import APIRouter, Query, HTTPException
|
||||
from tortoise.transactions import in_transaction
|
||||
from app.models.token import BossToken
|
||||
from app.schemas.base import Success
|
||||
|
||||
token_router = APIRouter()
|
||||
|
||||
|
||||
@token_router.get("/tokens")
async def list_tokens(
    wt2: Optional[str] = Query(None),
    mpt: Optional[str] = Query(None),
    page: int = Query(1, ge=1),
    page_size: int = Query(10, ge=1, le=200),
):
    """List BossToken rows, served from a short-lived in-memory cache.

    Responses are cached per ``(wt2, mpt, page, page_size)`` combination and
    reused while the entry is younger than ``_CACHE_TTL_SECONDS`` seconds.

    Args:
        wt2 (Optional[str]): Case-insensitive substring filter on ``wt2``.
        mpt (Optional[str]): Case-insensitive substring filter on ``mpt``.
        page (int): 1-based page number.
        page_size (int): Number of rows per page (1-200).

    Returns:
        Dict[str, Any]: Response dict with ``code``, ``data`` and ``total``.
    """
    cache_key: Tuple[Any, Any, int, int] = (wt2, mpt, page, page_size)
    now = time.monotonic()
    cached = _TOKENS_CACHE.get(cache_key)
    if cached and (now - cached[0] < _CACHE_TTL_SECONDS):
        return cached[1]

    qs = BossToken.all()
    if wt2:
        qs = qs.filter(wt2__icontains=wt2)
    if mpt:
        qs = qs.filter(mpt__icontains=mpt)
    total = await qs.count()
    items = await qs.order_by("-id").offset((page - 1) * page_size).limit(page_size)
    data = [
        {
            "id": item.id,
            "wt2": item.wt2,
            "mpt": item.mpt,
            "is_active": item.is_active,
            "failed_count": item.failed_count,
            "last_used_time": item.last_used_time,
            "created_at": item.created_at,
        }
        for item in items
    ]
    resp: Dict[str, Any] = {"code": 200, "data": data, "total": total}

    # Cache keys contain caller-supplied filter strings, so stale entries are
    # evicted on every miss to keep the dict from growing without bound.
    # The key list is materialised first so the dict is not mutated while
    # it is being iterated.
    stale_keys = [
        key
        for key, (stored_at, _) in _TOKENS_CACHE.items()
        if now - stored_at >= _CACHE_TTL_SECONDS
    ]
    for key in stale_keys:
        _TOKENS_CACHE.pop(key, None)

    _TOKENS_CACHE[cache_key] = (now, resp)
    return resp
|
||||
|
||||
|
||||
@token_router.post("/tokens")
async def create_token(payload: Dict[str, Any]):
    """Create a BossToken row from a free-form JSON payload.

    Args:
        payload: Request body. The keys read are ``wt2``, ``mpt``,
            ``is_active`` (default ``True``), ``failed_count`` (default ``0``)
            and ``last_used_time``.

    Returns:
        A ``Success`` response carrying the new row's ``id``.

    Raises:
        HTTPException: 400 with the text of any exception raised while
            coercing the payload or inserting the row. The original
            exception is chained as ``__cause__``.
    """
    try:
        async with in_transaction():
            item = await BossToken.create(
                wt2=payload.get("wt2"),
                mpt=payload.get("mpt"),
                # NOTE(review): bool() maps any non-empty string, including
                # "false", to True - confirm callers send JSON booleans.
                is_active=bool(payload.get("is_active", True)),
                failed_count=int(payload.get("failed_count", 0)),
                last_used_time=payload.get("last_used_time"),
            )
        # Invalidate cached list pages once the transaction block has exited.
        _TOKENS_CACHE.clear()
        return Success(data={"id": item.id})
    except Exception as e:
        # Chain the cause so the original traceback is not lost.
        raise HTTPException(status_code=400, detail=str(e)) from e
|
||||
|
||||
|
||||
@token_router.put("/tokens/{id}")
async def update_token(id: int, payload: Dict[str, Any]):
    """Apply a partial update to one BossToken row.

    Only the keys ``wt2``, ``mpt``, ``is_active``, ``failed_count`` and
    ``last_used_time`` are copied from ``payload``; any other key is ignored.

    Args:
        id: Primary key of the row to update.
        payload: Request body holding the fields to overwrite.

    Returns:
        A ``Success`` response carrying the row's ``id``.

    Raises:
        HTTPException: 404 when no row with that id exists.
    """
    record = await BossToken.get_or_none(id=id)
    if not record:
        raise HTTPException(status_code=404, detail="Token not found")

    editable = ("wt2", "mpt", "is_active", "failed_count", "last_used_time")
    for name, value in ((key, payload[key]) for key in editable if key in payload):
        setattr(record, name, value)

    await record.save()
    # Cached list pages may now be stale.
    _TOKENS_CACHE.clear()
    return Success(data={"id": record.id})
|
||||
|
||||
|
||||
@token_router.delete("/tokens/{token_id}")
async def delete_token(token_id: int):
    """Delete one BossToken row and invalidate the list cache.

    Args:
        token_id: Primary key of the row to delete.

    Returns:
        A ``Success`` response echoing the deleted ``id``.

    Raises:
        HTTPException: 404 when no row with that id exists.
    """
    doomed = await BossToken.get_or_none(id=token_id)
    if doomed:
        await doomed.delete()
        _TOKENS_CACHE.clear()
        return Success(data={"id": token_id})
    raise HTTPException(status_code=404, detail="Token not found")
|
||||
@ -1,19 +1,44 @@
|
||||
"""
|
||||
公司搜索控制器 — 使用新 crawler service 替代已删除的 company_spider
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from company_spider.qcwy_company.spider import search_company as qcwy_search_company
|
||||
from company_spider.zhilianzhaopin_company.spider import crawl_companies
|
||||
from loguru import logger
|
||||
|
||||
from app.services.crawler.qcwy import QcwyService
|
||||
from app.services.crawler.zhilian import ZhilianService
|
||||
|
||||
|
||||
class CompanyController:
|
||||
async def search_qcwy_company(self, keyword: str) -> Optional[Dict[str, Any]]:
|
||||
return await asyncio.to_thread(qcwy_search_company, keyword)
|
||||
def __init__(self):
|
||||
self._qcwy = QcwyService()
|
||||
self._zhilian = ZhilianService()
|
||||
|
||||
async def search_zhilian_company(self, keyword: str, city: Optional[str] = None) -> List[Dict[str, Any]]:
|
||||
params = {"kw": keyword}
|
||||
if city:
|
||||
params["city"] = city
|
||||
return await asyncio.to_thread(crawl_companies, params, 10)
|
||||
async def search_qcwy_company(self, keyword: str) -> Optional[Dict[str, Any]]:
    """Look up a company through the Qcwy service in a worker thread.

    Args:
        keyword: Search text passed to ``self._qcwy.get_company_info``.

    Returns:
        Whatever the service call returns, or ``None`` when it raises.
    """
    try:
        # The service call runs in a worker thread via asyncio.to_thread.
        company = await asyncio.to_thread(self._qcwy.get_company_info, keyword)
    except Exception as exc:
        logger.error(f"Qcwy company search failed: {exc}")
        company = None
    return company
|
||||
|
||||
async def search_zhilian_company(
    self, keyword: str, city: Optional[str] = None
) -> List[Dict[str, Any]]:
    """Search Zhilian by company name and return the ``data.list`` entries.

    Args:
        keyword: Company name passed to
            ``self._zhilian.search_company_jobs_by_name``.
        city: Accepted as part of the signature but not used by this method.

    Returns:
        The value under ``data -> list`` of the service result, or an empty
        list when the result is falsy, not a dict, has a non-dict ``data``
        member, or the call raises.
    """
    try:
        payload = await asyncio.to_thread(
            self._zhilian.search_company_jobs_by_name, keyword
        )
        # Anything that is not a non-empty dict is treated as "no hits".
        if not payload or not isinstance(payload, dict):
            return []
        body = payload.get("data", {})
        return body.get("list", []) if isinstance(body, dict) else []
    except Exception as exc:
        logger.error(f"Zhilian company search failed: {exc}")
        return []
|
||||
|
||||
|
||||
def create_company_controller() -> CompanyController:
|
||||
|
||||
@ -1,224 +0,0 @@
|
||||
from typing import Dict, Any, List, Optional
|
||||
from fastapi import HTTPException, BackgroundTasks
|
||||
from app.services.job import DataRouterService, DataType, PlatformType
|
||||
from app.log import logger
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class UniversalDataRequest(BaseModel):
    """Request body for storing a single record."""

    # Raw record to persist.
    data: Dict[str, Any] = Field(default=..., description="要存储的数据")
    # Kind of record being stored.
    data_type: DataType = Field(default=..., description="数据类型 (job/company)")
    # Platform the record was collected from.
    platform: PlatformType = Field(default=..., description="平台类型 (boss/qcwy/zhilian)")
    # Forwarded to the data router service as its check_duplicate argument.
    check_duplicate: bool = Field(default=True, description="是否检查重复数据")
|
||||
|
||||
|
||||
class BatchDataRequest(BaseModel):
    """Request body for storing a batch of records."""

    # Raw records to persist, one dict per record.
    data_list: List[Dict[str, Any]] = Field(default=..., description="要存储的数据列表")
    # Kind of record being stored; applies to the whole batch.
    data_type: DataType = Field(default=..., description="数据类型 (job/company)")
    # Platform the records were collected from; applies to the whole batch.
    platform: PlatformType = Field(default=..., description="平台类型 (boss/qcwy/zhilian)")
    # Forwarded to the data router service as its check_duplicate argument.
    check_duplicate: bool = Field(default=True, description="是否检查重复数据")
|
||||
|
||||
|
||||
class UniversalDataController:
    """Controller that relays storage and query requests to a ``DataRouterService``.

    The plain ``store_*`` methods await the service and wrap its result in a
    response dict. The ``*_async`` variants enqueue the same work on FastAPI
    ``BackgroundTasks`` and answer immediately with code 202.
    """

    # Human-readable platform names used in log lines, keyed by PlatformType value.
    _PLATFORM_DISPLAY_NAMES = {"boss": "Boss直聘", "qcwy": "前程无忧", "zhilian": "智联招聘"}

    def __init__(self, data_router_service: DataRouterService):
        """Keep a reference to the service that performs storage and queries."""
        self.data_router_service = data_router_service

    def _platform_display_name(self, platform: PlatformType) -> str:
        """Return the display name for a platform, falling back to its raw value."""
        return self._PLATFORM_DISPLAY_NAMES.get(platform.value, platform.value)

    async def store_single_data(self, request: UniversalDataRequest) -> Dict[str, Any]:
        """Store one record and report the service result.

        Returns:
            Response dict whose ``code`` is 200 when the service result has a
            truthy ``success`` entry and 400 otherwise.

        Raises:
            HTTPException: 500 when the service call raises; the original
                exception is chained.
        """
        try:
            result = await self.data_router_service.store_data(
                data=request.data,
                data_type=request.data_type,
                platform=request.platform,
                check_duplicate=request.check_duplicate,
            )
            return {
                "code": 200 if result["success"] else 400,
                "message": result["message"],
                "data": result,
                "platform": request.platform,
                "data_type": request.data_type,
            }
        except Exception as e:
            logger.error(f"存储单条数据失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"数据存储失败: {str(e)}") from e

    async def store_batch_data(self, request: BatchDataRequest) -> Dict[str, Any]:
        """Store a batch of records and summarise the per-outcome counts.

        Returns:
            Response dict with code 200 and a message built from the
            ``success``, ``failed`` and ``duplicate`` entries of the service
            result.

        Raises:
            HTTPException: 500 when the service call raises; the original
                exception is chained.
        """
        try:
            result = await self.data_router_service.batch_store_data(
                data_list=request.data_list,
                data_type=request.data_type,
                platform=request.platform,
                check_duplicate=request.check_duplicate,
            )
            return {
                "code": 200,
                "message": f"批量处理完成: 成功 {result['success']} 条,失败 {result['failed']} 条,重复 {result['duplicate']} 条",
                "data": result,
                "platform": request.platform,
                "data_type": request.data_type,
            }
        except Exception as e:
            logger.error(f"批量存储数据失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"批量数据存储失败: {str(e)}") from e

    async def store_single_data_async(
        self,
        background_tasks: BackgroundTasks,
        request: UniversalDataRequest,
    ) -> Dict[str, Any]:
        """Queue one record for background storage and answer with code 202.

        Raises:
            HTTPException: 500 when scheduling the task raises; the original
                exception is chained.
        """
        try:
            # The actual write happens later in _async_store_single_data.
            background_tasks.add_task(self._async_store_single_data, request)
            return {
                "code": 202,
                "message": "数据已加入异步处理队列",
                "platform": request.platform,
                "data_type": request.data_type,
            }
        except Exception as e:
            logger.error(f"异步存储单条数据失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"异步数据存储失败: {str(e)}") from e

    async def store_batch_data_async(
        self,
        background_tasks: BackgroundTasks,
        request: BatchDataRequest,
    ) -> Dict[str, Any]:
        """Queue a batch for background storage and answer with code 202.

        Raises:
            HTTPException: 500 when logging or scheduling raises; the original
                exception is chained.
        """
        try:
            platform_name = self._platform_display_name(request.platform)
            logger.info(f"📥 收到批量请求: [{platform_name}] {request.data_type.value} x{len(request.data_list)} 条")

            # The actual write happens later in _async_store_batch_data.
            background_tasks.add_task(self._async_store_batch_data, request)
            return {
                "code": 202,
                "message": f"批量数据已加入异步处理队列,共 {len(request.data_list)} 条",
                "platform": request.platform,
                "data_type": request.data_type,
            }
        except Exception as e:
            logger.error(f"异步批量存储数据失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"异步批量数据存储失败: {str(e)}") from e

    async def _async_store_single_data(self, request: UniversalDataRequest):
        """Background task: store one record and log the outcome.

        Exceptions are logged and swallowed, since no response is left to
        carry them once the task runs.
        """
        try:
            result = await self.data_router_service.store_data(
                data=request.data,
                data_type=request.data_type,
                platform=request.platform,
                check_duplicate=request.check_duplicate,
            )
            if result["success"]:
                logger.info(f"异步存储 {request.platform} {request.data_type} 数据成功")
            else:
                logger.warning(f"异步存储 {request.platform} {request.data_type} 数据失败: {result['message']}")
        except Exception as e:
            logger.error(f"异步存储单条数据后台任务失败: {str(e)}")

    async def _async_store_batch_data(self, request: BatchDataRequest):
        """Background task: store a batch and log the per-outcome counts.

        Exceptions are logged and swallowed, since no response is left to
        carry them once the task runs.
        """
        try:
            platform_name = self._platform_display_name(request.platform)
            result = await self.data_router_service.batch_store_data(
                data_list=request.data_list,
                data_type=request.data_type,
                platform=request.platform,
                check_duplicate=request.check_duplicate,
            )
            logger.info(f"✅ 批量处理完成: [{platform_name}] 成功 {result['success']} 条, 重复 {result['duplicate']} 条, 失败 {result['failed']} 条")
        except Exception as e:
            logger.error(f"异步批量存储数据后台任务失败: {str(e)}")

    async def query_data(
        self,
        platform: PlatformType,
        data_type: DataType,
        page: int = 1,
        page_size: int = 20,
    ) -> Dict[str, Any]:
        """Fetch one page of stored records for a platform and data type.

        Args:
            platform: Platform whose records are queried.
            data_type: Kind of record queried.
            page: 1-based page number.
            page_size: Number of records per page.

        Returns:
            Response dict whose ``data`` holds ``items``, ``total``, ``page``
            and ``page_size``. ``items`` and ``total`` come from the ``data``
            and ``count`` entries of the service result.

        Raises:
            HTTPException: 500 when the service call raises; the original
                exception is chained.
        """
        try:
            logger.info(f"查询 {platform} {data_type} 数据,页码: {page}, 页大小: {page_size}")

            # NOTE(review): page < 1 produces a negative offset; presumably
            # the route layer enforces page >= 1 - confirm.
            offset = (page - 1) * page_size
            result = await self.data_router_service.query_json_data(
                platform=platform,
                data_type=data_type,
                limit=page_size,
                offset=offset,
            )
            return {
                "code": 200,
                "message": "查询数据成功",
                "data": {
                    "items": result.get("data", []),
                    "total": result.get("count", 0),
                    "page": page,
                    "page_size": page_size,
                },
                "platform": platform,
                "data_type": data_type,
            }
        except Exception as e:
            logger.error(f"查询数据失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"查询数据失败: {str(e)}") from e

    async def get_supported_platforms(self) -> Dict[str, Any]:
        """Describe the supported platforms, data types and duplicate keys.

        Platforms and data types are enumerated from ``PlatformType`` and
        ``DataType``; the duplicate-key table is a static description.
        """
        return {
            "code": 200,
            "message": "获取支持的平台和数据类型成功",
            "data": {
                "platforms": [platform.value for platform in PlatformType],
                "data_types": [data_type.value for data_type in DataType],
                "platform_duplicate_keys": {
                    "boss": {
                        "job": "job_id",
                        "company": "company_name",
                    },
                    "qcwy": {
                        "job": "job_id + update_date_time",
                        "company": "company_name",
                    },
                    "zhilian": {
                        "job": "number + first_publish_time",
                        "company": "company_name",
                    },
                },
            },
        }
|
||||
|
||||
|
||||
def create_universal_data_controller(data_router_service: DataRouterService) -> UniversalDataController:
    """Factory that builds a ``UniversalDataController`` around the given service."""
    controller = UniversalDataController(data_router_service)
    return controller
|
||||
@ -4,7 +4,6 @@ from typing import Any, Dict, List, Type
|
||||
|
||||
from tortoise.expressions import Q
|
||||
|
||||
from app.core.crud import CRUDBase
|
||||
from app.models.keyword import BossKeyword, QcwyKeyword, ZhilianKeyword
|
||||
|
||||
|
||||
@ -16,103 +15,177 @@ class KeywordController:
|
||||
"zhilian": ZhilianKeyword,
|
||||
}
|
||||
|
||||
async def get_available(self, source: str, limit: int = 1, reserve: bool = True) -> Dict[str, Any]:
|
||||
"""获取当天未使用的检索条件(城市+岗位)
|
||||
async def get_available(
|
||||
self, source: str, limit: int = 1, reserve: bool = True, crawler_id: str = ""
|
||||
) -> Dict[str, Any]:
|
||||
"""获取可用关键词,优先返回断点续爬和失败重试的关键词
|
||||
|
||||
参数:
|
||||
source: 平台标识,取值为 boss|qcwy|zhilian
|
||||
limit: 返回数量上限
|
||||
reserve: 是否立即标记为已使用
|
||||
|
||||
返回:
|
||||
包含 items/total/limit 的字典结构
|
||||
|
||||
注意:使用原子操作避免并发时的竞态条件
|
||||
优先级:
|
||||
1. partial(断点续爬)
|
||||
2. failed 且 retry_count < 3(失败重试)
|
||||
3. 全新未使用关键词
|
||||
"""
|
||||
model = self._ensure_model(source)
|
||||
today = date.today()
|
||||
now = datetime.now()
|
||||
|
||||
# 先统计总数
|
||||
search = Q(last_requested_date__not=today) | Q(last_requested_date=None)
|
||||
total = await model.filter(search).count()
|
||||
|
||||
# 优先级 1: 断点续爬 (partial)
|
||||
partial_q = Q(crawl_status="partial", last_requested_date=today)
|
||||
# 优先级 2: 失败重试 (failed, retry < 3)
|
||||
failed_q = Q(crawl_status="failed", last_requested_date=today, retry_count__lt=3)
|
||||
# 优先级 3: 全新关键词
|
||||
fresh_q = Q(last_requested_date__not=today) | Q(last_requested_date=None)
|
||||
|
||||
items = []
|
||||
|
||||
if total > 0 and reserve:
|
||||
# 使用原子操作:先更新,再查询已更新的记录
|
||||
# 这样可以避免查询和标记之间的竞态条件
|
||||
take = max(1, min(limit, total))
|
||||
|
||||
|
||||
for priority, query, is_fresh in [
|
||||
("partial", partial_q, False),
|
||||
("failed", failed_q, False),
|
||||
("fresh", fresh_q, True),
|
||||
]:
|
||||
count = await model.filter(query).count()
|
||||
if count == 0:
|
||||
continue
|
||||
|
||||
take = max(1, min(limit - len(items), count))
|
||||
if take <= 0:
|
||||
break
|
||||
|
||||
try:
|
||||
# 获取一批未使用的记录ID(随机选择)
|
||||
candidate_records = await model.filter(search).offset(
|
||||
random.randint(0, max(0, total - take))
|
||||
).limit(take).only('id')
|
||||
|
||||
candidate_ids = [r.id for r in candidate_records]
|
||||
|
||||
if candidate_ids:
|
||||
# 原子性地更新这些记录(只更新未使用的)
|
||||
# 使用数据库的原子UPDATE操作
|
||||
updated_count = await model.filter(
|
||||
id__in=candidate_ids
|
||||
).filter(
|
||||
Q(last_requested_date__isnull=True) | Q(last_requested_date__not=today)
|
||||
).update(
|
||||
last_requested_date=today,
|
||||
last_requested_at=now
|
||||
)
|
||||
|
||||
# 查询成功更新的记录
|
||||
if updated_count > 0:
|
||||
records = await model.filter(
|
||||
id__in=candidate_ids,
|
||||
last_requested_date=today
|
||||
).limit(updated_count)
|
||||
items = [{"id": r.id, "city": r.city, "job": r.job} for r in records]
|
||||
except Exception as e:
|
||||
# 如果原子操作失败,回退到原来的方法
|
||||
import logging
|
||||
logging.warning(f"原子操作失败,回退到原方法: {e}")
|
||||
take = max(1, min(limit, total))
|
||||
start = 0 if total == take else random.randint(0, total - take)
|
||||
records = await model.filter(search).offset(start).limit(take)
|
||||
items = [{"id": r.id, "city": r.city, "job": r.job} for r in records]
|
||||
offset = random.randint(0, max(0, count - take))
|
||||
candidates = await model.filter(query).offset(offset).limit(take).only("id")
|
||||
candidate_ids = [r.id for r in candidates]
|
||||
|
||||
if not candidate_ids:
|
||||
continue
|
||||
|
||||
update_fields = {
|
||||
"last_requested_at": now,
|
||||
"crawl_status": "crawling",
|
||||
"crawl_started_at": now,
|
||||
"crawler_id": crawler_id,
|
||||
}
|
||||
|
||||
if is_fresh:
|
||||
update_fields["last_requested_date"] = today
|
||||
update_fields["last_completed_page"] = 0
|
||||
update_fields["total_pages"] = 0
|
||||
update_fields["jobs_found"] = 0
|
||||
update_fields["error_message"] = ""
|
||||
update_fields["retry_count"] = 0
|
||||
|
||||
if reserve:
|
||||
ids = [r.id for r in records]
|
||||
await self.mark_used(source, ids)
|
||||
elif total > 0:
|
||||
# 如果不需要reserve,直接查询
|
||||
take = max(1, min(limit, total))
|
||||
start = 0 if total == take else random.randint(0, total - take)
|
||||
records = await model.filter(search).offset(start).limit(take)
|
||||
items = [{"id": r.id, "city": r.city, "job": r.job} for r in records]
|
||||
|
||||
await model.filter(id__in=candidate_ids).update(**update_fields)
|
||||
|
||||
records = await model.filter(id__in=candidate_ids).limit(take)
|
||||
for r in records:
|
||||
items.append({
|
||||
"id": r.id,
|
||||
"city": r.city,
|
||||
"job": r.job,
|
||||
"last_completed_page": r.last_completed_page,
|
||||
"crawl_status": r.crawl_status,
|
||||
})
|
||||
|
||||
if len(items) >= limit:
|
||||
break
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
total_available = await model.filter(
|
||||
partial_q | failed_q | fresh_q
|
||||
).count()
|
||||
|
||||
return {
|
||||
"code": 200,
|
||||
"message": "查询可用检索条件成功",
|
||||
"data": {
|
||||
"items": items,
|
||||
"total": total,
|
||||
"total": total_available,
|
||||
"limit": limit,
|
||||
},
|
||||
}
|
||||
|
||||
async def report_page_progress(
    self,
    source: str,
    keyword_id: int,
    page: int,
    total_pages: int = 0,
    jobs_found: int = 0,
) -> Dict[str, Any]:
    """Record that a crawler finished one page of a keyword.

    Args:
        source: Platform identifier resolved through ``_ensure_model``.
        keyword_id: Primary key of the keyword row.
        page: Page number stored as ``last_completed_page``.
        total_pages: Stored as ``total_pages`` only when greater than zero.
        jobs_found: Amount added to the row's running ``jobs_found`` total.

    Returns:
        A 404 dict when the keyword does not exist, otherwise a 200 dict
        echoing the stored progress values.
    """
    model = self._ensure_model(source)
    keyword = await model.filter(id=keyword_id).first()
    if not keyword:
        return {"code": 404, "message": "关键词不存在"}

    # A missing (None) running total is treated as zero before adding.
    previous_total = keyword.jobs_found or 0
    keyword.jobs_found = previous_total + jobs_found
    keyword.last_completed_page = page
    if total_pages > 0:
        keyword.total_pages = total_pages

    await keyword.save(
        update_fields=["last_completed_page", "total_pages", "jobs_found"]
    )

    progress = {
        "keyword_id": keyword_id,
        "last_completed_page": keyword.last_completed_page,
        "total_pages": keyword.total_pages,
        "jobs_found": keyword.jobs_found,
    }
    return {"code": 200, "message": "进度更新成功", "data": progress}
|
||||
|
||||
async def report_crawl_complete(
    self,
    source: str,
    keyword_id: int,
    status: str,
    error_message: str = "",
) -> Dict[str, Any]:
    """Record the final outcome of a keyword crawl.

    Args:
        source: Platform identifier resolved through ``_ensure_model``.
        keyword_id: Primary key of the keyword row.
        status: Final state; only ``"completed"`` and ``"failed"`` are accepted.
        error_message: Stored on the row as given (empty string by default).

    Returns:
        A 400 dict for an unsupported ``status``, a 404 dict when the keyword
        does not exist, otherwise a 200 dict with the stored ``crawl_status``
        and ``retry_count``.
    """
    model = self._ensure_model(source)

    # Reject an unsupported status before spending a database round trip.
    if status not in ("completed", "failed"):
        return {"code": 400, "message": "status 仅支持 completed/failed"}

    obj = await model.filter(id=keyword_id).first()
    if not obj:
        return {"code": 404, "message": "关键词不存在"}

    obj.crawl_status = status
    obj.error_message = error_message
    update_fields = ["crawl_status", "error_message"]

    # Only failures increment the retry counter; a missing value counts as 0.
    if status == "failed":
        obj.retry_count = (obj.retry_count or 0) + 1
        update_fields.append("retry_count")

    await obj.save(update_fields=update_fields)

    return {
        "code": 200,
        "message": f"爬取状态已更新为 {status}",
        "data": {
            "keyword_id": keyword_id,
            "crawl_status": obj.crawl_status,
            "retry_count": obj.retry_count,
        },
    }
|
||||
|
||||
async def get_stats(self, source: str, on_date: date | None = None) -> Dict[str, Any]:
|
||||
"""统计指定平台在某日期的使用与未使用数量
|
||||
|
||||
参数:
|
||||
source: 平台标识,取值为 boss|qcwy|zhilian
|
||||
on_date: 统计日期,不传则为今天
|
||||
|
||||
返回:
|
||||
包含 total/used/unused 的字典结构
|
||||
"""
|
||||
"""统计指定平台关键词使用和爬取状态"""
|
||||
model = self._ensure_model(source)
|
||||
d = on_date or date.today()
|
||||
total = await model.all().count()
|
||||
used = await model.filter(last_requested_date=d).count()
|
||||
unused = max(0, total - used)
|
||||
|
||||
crawling = await model.filter(crawl_status="crawling", last_requested_date=d).count()
|
||||
completed = await model.filter(crawl_status="completed", last_requested_date=d).count()
|
||||
failed = await model.filter(crawl_status="failed", last_requested_date=d).count()
|
||||
partial = await model.filter(crawl_status="partial", last_requested_date=d).count()
|
||||
|
||||
return {
|
||||
"code": 200,
|
||||
"message": "统计成功",
|
||||
@ -121,19 +194,17 @@ class KeywordController:
|
||||
"total": total,
|
||||
"used": used,
|
||||
"unused": unused,
|
||||
"crawl_status": {
|
||||
"crawling": crawling,
|
||||
"completed": completed,
|
||||
"failed": failed,
|
||||
"partial": partial,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
async def mark_used(self, source: str, ids: List[int]) -> Dict[str, Any]:
|
||||
"""将检索条件标记为今日已使用
|
||||
|
||||
参数:
|
||||
source: 平台标识,取值为 boss|qcwy|zhilian
|
||||
ids: 需要标记的记录主键ID列表
|
||||
|
||||
返回:
|
||||
更新结果,包括成功条数与日期
|
||||
"""
|
||||
"""将检索条件标记为今日已使用"""
|
||||
model = self._ensure_model(source)
|
||||
updated = 0
|
||||
now = datetime.now()
|
||||
@ -166,18 +237,7 @@ class KeywordController:
|
||||
city: str | None = None,
|
||||
job: str | None = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""获取关键词列表
|
||||
|
||||
参数:
|
||||
source: 平台标识
|
||||
page: 页码
|
||||
page_size: 每页数量
|
||||
city: 城市过滤
|
||||
job: 职位过滤
|
||||
|
||||
返回:
|
||||
包含列表数据和分页信息的字典
|
||||
"""
|
||||
"""获取关键词列表"""
|
||||
model = self._ensure_model(source)
|
||||
queryset = model.all()
|
||||
if city:
|
||||
@ -193,6 +253,11 @@ class KeywordController:
|
||||
"job",
|
||||
"last_requested_date",
|
||||
"last_requested_at",
|
||||
"crawl_status",
|
||||
"last_completed_page",
|
||||
"total_pages",
|
||||
"jobs_found",
|
||||
"retry_count",
|
||||
"created_at",
|
||||
"updated_at",
|
||||
)
|
||||
@ -207,17 +272,8 @@ class KeywordController:
|
||||
}
|
||||
|
||||
async def create_keyword(self, source: str, obj_in: Any) -> Dict[str, Any]:
|
||||
"""创建关键词
|
||||
|
||||
参数:
|
||||
source: 平台标识
|
||||
obj_in: 创建数据对象
|
||||
|
||||
返回:
|
||||
创建结果
|
||||
"""
|
||||
"""创建关键词"""
|
||||
model = self._ensure_model(source)
|
||||
# Check if already exists
|
||||
exists = await model.filter(city=obj_in.city, job=obj_in.job).exists()
|
||||
if exists:
|
||||
return {"code": 400, "message": "该关键词组合已存在"}
|
||||
@ -235,16 +291,7 @@ class KeywordController:
|
||||
return {"code": 200, "message": "创建成功", "data": data}
|
||||
|
||||
async def update_keyword(self, source: str, id: int, obj_in: Any) -> Dict[str, Any]:
|
||||
"""更新关键词
|
||||
|
||||
参数:
|
||||
source: 平台标识
|
||||
id: 记录ID
|
||||
obj_in: 更新数据对象
|
||||
|
||||
返回:
|
||||
更新结果
|
||||
"""
|
||||
"""更新关键词"""
|
||||
model = self._ensure_model(source)
|
||||
obj = await model.filter(id=id).first()
|
||||
if not obj:
|
||||
@ -252,7 +299,6 @@ class KeywordController:
|
||||
|
||||
update_data = obj_in.model_dump(exclude_unset=True)
|
||||
if update_data:
|
||||
# Check for duplicates if updating city or job
|
||||
if "city" in update_data or "job" in update_data:
|
||||
city = update_data.get("city", obj.city)
|
||||
job = update_data.get("job", obj.job)
|
||||
@ -275,15 +321,7 @@ class KeywordController:
|
||||
return {"code": 200, "message": "更新成功", "data": data}
|
||||
|
||||
async def delete_keyword(self, source: str, id: int) -> Dict[str, Any]:
|
||||
"""删除关键词
|
||||
|
||||
参数:
|
||||
source: 平台标识
|
||||
id: 记录ID
|
||||
|
||||
返回:
|
||||
删除结果
|
||||
"""
|
||||
"""删除关键词"""
|
||||
model = self._ensure_model(source)
|
||||
obj = await model.filter(id=id).first()
|
||||
if not obj:
|
||||
@ -296,20 +334,26 @@ class KeywordController:
|
||||
}
|
||||
|
||||
async def get_overview_stats(self) -> Dict[str, Any]:
|
||||
"""获取所有平台的统计概览
|
||||
|
||||
返回:
|
||||
包含各平台统计数据的字典
|
||||
"""
|
||||
"""获取所有平台的统计概览"""
|
||||
today = date.today()
|
||||
stats = {}
|
||||
for source, model in self._model_map.items():
|
||||
total = await model.all().count()
|
||||
used = await model.filter(last_requested_date=today).count()
|
||||
crawling = await model.filter(crawl_status="crawling", last_requested_date=today).count()
|
||||
completed = await model.filter(crawl_status="completed", last_requested_date=today).count()
|
||||
failed = await model.filter(crawl_status="failed", last_requested_date=today).count()
|
||||
partial_count = await model.filter(crawl_status="partial", last_requested_date=today).count()
|
||||
stats[source] = {
|
||||
"total": total,
|
||||
"used": used,
|
||||
"unused": max(0, total - used),
|
||||
"crawl_status": {
|
||||
"crawling": crawling,
|
||||
"completed": completed,
|
||||
"failed": failed,
|
||||
"partial": partial_count,
|
||||
},
|
||||
}
|
||||
return {
|
||||
"code": 200,
|
||||
@ -318,14 +362,7 @@ class KeywordController:
|
||||
}
|
||||
|
||||
def _ensure_model(self, source: str) -> Type:
|
||||
"""根据平台标识返回对应模型类型
|
||||
|
||||
参数:
|
||||
source: 平台标识,取值为 boss|qcwy|zhilian
|
||||
|
||||
返回:
|
||||
对应的 Tortoise ORM 模型类型
|
||||
"""
|
||||
"""根据平台标识返回对应模型类型"""
|
||||
model = self._model_map.get(source)
|
||||
if not model:
|
||||
raise ValueError("不支持的平台标识")
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
import time
|
||||
import os
|
||||
from typing import Dict, Any, Optional, List, Tuple
|
||||
import random
|
||||
|
||||
|
||||
@ -48,9 +48,11 @@ class ClickHouseManager:
|
||||
async def close(self):
|
||||
"""关闭ClickHouse连接"""
|
||||
if self._client:
|
||||
await self._client.close()
|
||||
close_result = self._client.close()
|
||||
if close_result is not None:
|
||||
await close_result
|
||||
self._client = None
|
||||
|
||||
|
||||
# 全局ClickHouse管理器实例
|
||||
clickhouse_manager = ClickHouseManager()
|
||||
clickhouse_manager = ClickHouseManager()
|
||||
|
||||
@ -4,235 +4,198 @@ from app.log import logger
|
||||
|
||||
class ClickHouseInitializer:
|
||||
"""ClickHouse数据库初始化器"""
|
||||
|
||||
|
||||
# 6张数据表的 DDL 定义(含 channel 列)
|
||||
_TABLE_DDLS = {
|
||||
"boss_job": """
|
||||
CREATE TABLE IF NOT EXISTS job_data.boss_job (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '',
|
||||
job_id String DEFAULT '',
|
||||
channel String DEFAULT 'mini',
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
""",
|
||||
"boss_company": """
|
||||
CREATE TABLE IF NOT EXISTS job_data.boss_company (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '',
|
||||
company_name String DEFAULT '',
|
||||
channel String DEFAULT 'mini',
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
""",
|
||||
"qcwy_job": """
|
||||
CREATE TABLE IF NOT EXISTS job_data.qcwy_job (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '',
|
||||
job_id String DEFAULT '',
|
||||
update_date_time String DEFAULT '',
|
||||
channel String DEFAULT 'mini',
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
""",
|
||||
"qcwy_company": """
|
||||
CREATE TABLE IF NOT EXISTS job_data.qcwy_company (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '',
|
||||
company_name String DEFAULT '',
|
||||
channel String DEFAULT 'mini',
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
""",
|
||||
"zhilian_job": """
|
||||
CREATE TABLE IF NOT EXISTS job_data.zhilian_job (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '',
|
||||
number String DEFAULT '',
|
||||
first_publish_time String DEFAULT '',
|
||||
channel String DEFAULT 'mini',
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
""",
|
||||
"zhilian_company": """
|
||||
CREATE TABLE IF NOT EXISTS job_data.zhilian_company (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '',
|
||||
company_name String DEFAULT '',
|
||||
channel String DEFAULT 'mini',
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
""",
|
||||
}
|
||||
|
||||
_PENDING_COMPANY_DDL = """
|
||||
CREATE TABLE IF NOT EXISTS job_data.pending_company (
|
||||
source String,
|
||||
company_id String,
|
||||
company_name String DEFAULT '',
|
||||
status String DEFAULT 'pending',
|
||||
error_msg String DEFAULT '',
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now(),
|
||||
version UInt64 DEFAULT 1
|
||||
) ENGINE = ReplacingMergeTree(version)
|
||||
ORDER BY (source, company_id)
|
||||
SETTINGS index_granularity = 8192;
|
||||
"""
|
||||
|
||||
_JOB_ANALYTICS_VIEW = """
|
||||
CREATE OR REPLACE VIEW job_data.job_analytics AS
|
||||
SELECT
|
||||
'boss' as source,
|
||||
job_id,
|
||||
channel,
|
||||
JSONExtractString(json_data, 'jobName') as position_name,
|
||||
JSONExtractString(json_data, 'brandName') as company_name,
|
||||
JSONExtractString(json_data, 'salaryDesc') as salary_text,
|
||||
0.0 as salary_min,
|
||||
0.0 as salary_max,
|
||||
JSONExtractString(json_data, 'cityName') as city,
|
||||
JSONExtractString(json_data, 'experienceName') as experience_required,
|
||||
JSONExtractString(json_data, 'degreeName') as education,
|
||||
created_at
|
||||
FROM job_data.boss_job
|
||||
UNION ALL
|
||||
SELECT
|
||||
'qcwy' as source,
|
||||
job_id,
|
||||
channel,
|
||||
JSONExtractString(json_data, 'jobName') as position_name,
|
||||
JSONExtractString(json_data, 'companyName') as company_name,
|
||||
JSONExtractString(json_data, 'provideSalaryString') as salary_text,
|
||||
0.0, 0.0,
|
||||
JSONExtractString(json_data, 'workCity') as city,
|
||||
JSONExtractString(json_data, 'workYear') as experience_required,
|
||||
JSONExtractString(json_data, 'degree') as education,
|
||||
created_at
|
||||
FROM job_data.qcwy_job
|
||||
UNION ALL
|
||||
SELECT
|
||||
'zhilian' as source,
|
||||
number as job_id,
|
||||
channel,
|
||||
JSONExtractString(json_data, 'jobName') as position_name,
|
||||
JSONExtractString(json_data, 'companyName') as company_name,
|
||||
JSONExtractString(json_data, 'salary60') as salary_text,
|
||||
0.0, 0.0,
|
||||
JSONExtractString(json_data, 'workCity') as city,
|
||||
JSONExtractString(json_data, 'workingExp') as experience_required,
|
||||
JSONExtractString(json_data, 'education') as education,
|
||||
created_at
|
||||
FROM job_data.zhilian_job
|
||||
"""
|
||||
|
||||
# 需要添加 channel 列的表
|
||||
_CHANNEL_MIGRATION_TABLES = [
|
||||
"boss_job", "boss_company",
|
||||
"qcwy_job", "qcwy_company",
|
||||
"zhilian_job", "zhilian_company",
|
||||
]
|
||||
|
||||
def __init__(self, client: AsyncClient):
    """Keep a reference to the ClickHouse client.

    Args:
        client: Async ClickHouse client; stored as ``self.client`` and used
            by the methods of this class to send commands.
    """
    self.client = client
|
||||
|
||||
async def create_boss_job_json_table(self):
|
||||
"""创建BOSS招聘职位JSON存储表"""
|
||||
create_table_sql = """
|
||||
CREATE TABLE IF NOT EXISTS job_data.boss_job (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '', -- 原始JSON数据
|
||||
job_id String DEFAULT '', -- BOSS平台去重字段:jobBaseInfoVO.jobId
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
"""
|
||||
|
||||
async def _create_table(self, name: str, ddl: str) -> None:
|
||||
try:
|
||||
await self.client.command(create_table_sql)
|
||||
logger.info("BOSS职位JSON数据表 boss_job 创建成功")
|
||||
await self.client.command(ddl)
|
||||
logger.info(f"表 {name} 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建BOSS职位JSON数据表失败: {e}")
|
||||
raise
|
||||
|
||||
async def create_boss_company_json_table(self):
|
||||
"""创建BOSS招聘公司JSON存储表"""
|
||||
create_table_sql = """
|
||||
CREATE TABLE IF NOT EXISTS job_data.boss_company (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '', -- 原始JSON数据
|
||||
company_name String DEFAULT '', -- 公司名称去重字段
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
"""
|
||||
|
||||
try:
|
||||
await self.client.command(create_table_sql)
|
||||
logger.info("BOSS公司JSON数据表 boss_company 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建BOSS公司JSON数据表失败: {e}")
|
||||
raise
|
||||
|
||||
async def create_qcwy_job_json_table(self):
|
||||
"""创建前程无忧职位JSON存储表"""
|
||||
create_table_sql = """
|
||||
CREATE TABLE IF NOT EXISTS job_data.qcwy_job (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '', -- 原始JSON数据
|
||||
job_id String DEFAULT '', -- QCWY平台去重字段:jobId
|
||||
update_date_time String DEFAULT '', -- QCWY平台去重字段:updateDateTime
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
"""
|
||||
|
||||
try:
|
||||
await self.client.command(create_table_sql)
|
||||
logger.info("前程无忧职位JSON数据表 qcwy_job 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建前程无忧职位JSON数据表失败: {e}")
|
||||
raise
|
||||
|
||||
async def create_qcwy_company_json_table(self):
|
||||
"""创建前程无忧公司JSON存储表"""
|
||||
create_table_sql = """
|
||||
CREATE TABLE IF NOT EXISTS job_data.qcwy_company (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '', -- 原始JSON数据
|
||||
company_name String DEFAULT '', -- 公司名称去重字段
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
"""
|
||||
|
||||
try:
|
||||
await self.client.command(create_table_sql)
|
||||
logger.info("前程无忧公司JSON数据表 qcwy_company 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建前程无忧公司JSON数据表失败: {e}")
|
||||
raise
|
||||
|
||||
async def create_zhilian_job_json_table(self):
|
||||
"""创建智联招聘职位JSON存储表"""
|
||||
create_table_sql = """
|
||||
CREATE TABLE IF NOT EXISTS job_data.zhilian_job (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '', -- 原始JSON数据
|
||||
number String DEFAULT '', -- 智联平台去重字段:number
|
||||
first_publish_time String DEFAULT '', -- 智联平台去重字段:firstPublishTime
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
"""
|
||||
|
||||
try:
|
||||
await self.client.command(create_table_sql)
|
||||
logger.info("智联招聘职位JSON数据表 zhilian_job 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建智联招聘职位JSON数据表失败: {e}")
|
||||
raise
|
||||
|
||||
async def create_zhilian_company_json_table(self):
|
||||
"""创建智联招聘公司JSON存储表"""
|
||||
create_table_sql = """
|
||||
CREATE TABLE IF NOT EXISTS job_data.zhilian_company (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '', -- 原始JSON数据
|
||||
company_name String DEFAULT '', -- 公司名称去重字段
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
"""
|
||||
|
||||
try:
|
||||
await self.client.command(create_table_sql)
|
||||
logger.info("智联招聘公司JSON数据表 zhilian_company 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建智联招聘公司JSON数据表失败: {e}")
|
||||
logger.error(f"创建表 {name} 失败: {e}")
|
||||
raise
|
||||
|
||||
async def create_pending_company_table(self):
|
||||
"""创建待处理公司表"""
|
||||
create_table_sql = """
|
||||
CREATE TABLE IF NOT EXISTS job_data.pending_company (
|
||||
source String,
|
||||
company_id String,
|
||||
company_name String DEFAULT '',
|
||||
status String DEFAULT 'pending',
|
||||
error_msg String DEFAULT '',
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now(),
|
||||
version UInt64 DEFAULT 1
|
||||
) ENGINE = ReplacingMergeTree(version)
|
||||
ORDER BY (source, company_id)
|
||||
SETTINGS index_granularity = 8192;
|
||||
"""
|
||||
|
||||
try:
|
||||
await self.client.command(create_table_sql)
|
||||
logger.info("待处理公司表 pending_company 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建待处理公司表失败: {e}")
|
||||
raise
|
||||
async def initialize_channel_migration(self) -> None:
    """Add the ``channel`` column to every table in ``_CHANNEL_MIGRATION_TABLES``.

    One ``ALTER TABLE ... ADD COLUMN IF NOT EXISTS`` command is sent per table.
    A failure on one table is logged as a warning and the loop moves on to the
    next table; nothing is re-raised.
    """
    column_clause = "ADD COLUMN IF NOT EXISTS channel String DEFAULT 'mini'"
    for table_name in self._CHANNEL_MIGRATION_TABLES:
        try:
            statement = f"ALTER TABLE job_data.{table_name} {column_clause}"
            await self.client.command(statement)
            logger.info(f"表 {table_name} channel 列迁移完成")
        except Exception as exc:
            # Best effort: report and continue with the remaining tables.
            logger.warning(f"表 {table_name} channel 列迁移跳过: {exc}")
|
||||
|
||||
async def create_job_analytics_view(self):
|
||||
"""创建统一的招聘数据分析视图"""
|
||||
create_view_sql = """
|
||||
CREATE VIEW IF NOT EXISTS job_data.job_analytics AS
|
||||
SELECT
|
||||
'boss' as source,
|
||||
job_id,
|
||||
JSONExtractString(json_data, 'jobName') as position_name,
|
||||
JSONExtractString(json_data, 'brandName') as company_name,
|
||||
JSONExtractString(json_data, 'salaryDesc') as salary_text,
|
||||
0.0 as salary_min,
|
||||
0.0 as salary_max,
|
||||
JSONExtractString(json_data, 'cityName') as city,
|
||||
JSONExtractString(json_data, 'experienceName') as experience_required,
|
||||
JSONExtractString(json_data, 'degreeName') as education,
|
||||
created_at
|
||||
FROM job_data.boss_job
|
||||
UNION ALL
|
||||
SELECT
|
||||
'qcwy' as source,
|
||||
job_id,
|
||||
JSONExtractString(json_data, 'jobName') as position_name,
|
||||
JSONExtractString(json_data, 'companyName') as company_name,
|
||||
JSONExtractString(json_data, 'provideSalaryString') as salary_text,
|
||||
0.0, 0.0,
|
||||
JSONExtractString(json_data, 'workCity') as city,
|
||||
JSONExtractString(json_data, 'workYear') as experience_required,
|
||||
JSONExtractString(json_data, 'degree') as education,
|
||||
created_at
|
||||
FROM job_data.qcwy_job
|
||||
UNION ALL
|
||||
SELECT
|
||||
'zhilian' as source,
|
||||
number as job_id,
|
||||
JSONExtractString(json_data, 'jobName') as position_name,
|
||||
JSONExtractString(json_data, 'companyName') as company_name,
|
||||
JSONExtractString(json_data, 'salary60') as salary_text,
|
||||
0.0, 0.0,
|
||||
JSONExtractString(json_data, 'workCity') as city,
|
||||
JSONExtractString(json_data, 'workingExp') as experience_required,
|
||||
JSONExtractString(json_data, 'education') as education,
|
||||
created_at
|
||||
FROM job_data.zhilian_job
|
||||
"""
|
||||
try:
|
||||
await self.client.command(create_view_sql)
|
||||
logger.info("招聘数据分析视图 job_analytics 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建招聘数据分析视图失败: {e}")
|
||||
raise
|
||||
|
||||
async def initialize_all_tables(self):
|
||||
async def initialize_all_tables(self) -> None:
|
||||
"""初始化所有表"""
|
||||
logger.info("开始初始化 ClickHouse 数据库表...")
|
||||
|
||||
|
||||
try:
|
||||
# 创建BOSS招聘JSON表
|
||||
await self.create_boss_job_json_table()
|
||||
await self.create_boss_company_json_table()
|
||||
|
||||
# 创建前程无忧JSON表
|
||||
await self.create_qcwy_job_json_table()
|
||||
await self.create_qcwy_company_json_table()
|
||||
|
||||
# 创建智联招聘JSON表
|
||||
await self.create_zhilian_job_json_table()
|
||||
await self.create_zhilian_company_json_table()
|
||||
# 创建6张数据表
|
||||
for name, ddl in self._TABLE_DDLS.items():
|
||||
await self._create_table(name, ddl)
|
||||
|
||||
# 创建待处理公司表
|
||||
await self.create_pending_company_table()
|
||||
await self._create_table("pending_company", self._PENDING_COMPANY_DDL)
|
||||
|
||||
# 创建统一分析视图
|
||||
await self.create_job_analytics_view()
|
||||
# 对已存在的表添加 channel 列
|
||||
await self.initialize_channel_migration()
|
||||
|
||||
# 创建/重建统一分析视图(含 channel 列)
|
||||
try:
|
||||
await self.client.command(self._JOB_ANALYTICS_VIEW)
|
||||
logger.info("招聘数据分析视图 job_analytics 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建招聘数据分析视图失败: {e}")
|
||||
raise
|
||||
|
||||
logger.info("ClickHouse 数据库表初始化完成")
|
||||
except Exception as e:
|
||||
|
||||
@ -1,6 +1,8 @@
|
||||
import os
|
||||
from typing import Optional, Dict, Any
|
||||
import jwt
|
||||
from fastapi import Depends, Header, HTTPException, Request
|
||||
from loguru import logger
|
||||
|
||||
from app.core.ctx import CTX_USER_ID
|
||||
from app.models import Role, User
|
||||
@ -23,7 +25,7 @@ class AuthControl:
|
||||
@classmethod
|
||||
async def is_authed(cls, token: str = Header(..., description="token验证")) -> Optional["User"]:
|
||||
try:
|
||||
if token == "dev":
|
||||
if token == "dev" and os.getenv("APP_ENV", "production") == "development":
|
||||
user = await User.filter().first()
|
||||
user_id = user.id
|
||||
else:
|
||||
@ -39,7 +41,8 @@ class AuthControl:
|
||||
except jwt.ExpiredSignatureError:
|
||||
raise HTTPException(status_code=401, detail="登录已过期")
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"{repr(e)}")
|
||||
logger.error(f"Auth error: {repr(e)}")
|
||||
raise HTTPException(status_code=500, detail="Internal authentication error")
|
||||
|
||||
|
||||
class PermissionControl:
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
from aerich import Command
|
||||
from fastapi import FastAPI
|
||||
@ -27,8 +28,9 @@ from app.schemas.menus import MenuType
|
||||
from app.settings.config import settings
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from app.core.clickhouse_init import ClickHouseInitializer
|
||||
from app.services.ingest.remote_push import close_http_client
|
||||
|
||||
from .middlewares import BackGroundTaskMiddleware, HttpAuditLogMiddleware
|
||||
from .middlewares import BackGroundTaskMiddleware
|
||||
from .ip_tracking import IpTrackingMiddleware
|
||||
|
||||
|
||||
@ -42,15 +44,6 @@ def make_middlewares():
|
||||
allow_headers=settings.CORS_ALLOW_HEADERS,
|
||||
),
|
||||
Middleware(BackGroundTaskMiddleware),
|
||||
Middleware(
|
||||
HttpAuditLogMiddleware,
|
||||
methods=["GET", "POST", "PUT", "DELETE"],
|
||||
exclude_paths=[
|
||||
"/api/v1/base/access_token",
|
||||
"/docs",
|
||||
"/openapi.json",
|
||||
],
|
||||
),
|
||||
Middleware(IpTrackingMiddleware),
|
||||
]
|
||||
return middleware
|
||||
@ -266,10 +259,15 @@ async def init_apis():
|
||||
async def init_db():
|
||||
"""执行数据库迁移(受环境开关与并发保护控制)"""
|
||||
command = Command(tortoise_config=settings.TORTOISE_ORM)
|
||||
await command.init_db(safe=True)
|
||||
await command.init()
|
||||
migration_dir = Path("migrations") / "models"
|
||||
if not migration_dir.exists():
|
||||
await command.init_db(safe=True)
|
||||
return
|
||||
try:
|
||||
await command.migrate()
|
||||
except FileExistsError as e:
|
||||
logger.info(f"跳过重复迁移文件生成: {e}")
|
||||
except AttributeError:
|
||||
logger.warning("unable to retrieve model history from database, model history will be created from scratch")
|
||||
shutil.rmtree("migrations")
|
||||
|
||||
@ -1,5 +1,4 @@
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
|
||||
from starlette.requests import Request
|
||||
|
||||
@ -1,11 +1,15 @@
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
import time
|
||||
import uuid
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
|
||||
from loguru import logger
|
||||
|
||||
|
||||
class DistributedLock:
|
||||
"""分布式锁封装,优先使用 Redis,不可用时降级为文件锁"""
|
||||
"""分布式锁封装,优先使用 Redis,不可用时降级为文件锁(带 TTL)"""
|
||||
|
||||
def __init__(self, name: str, ttl_seconds: int = 600):
|
||||
self.name = name
|
||||
@ -13,35 +17,77 @@ class DistributedLock:
|
||||
self.token = str(uuid.uuid4())
|
||||
self._use_redis = False
|
||||
self._redis = None
|
||||
self._file_path = f".lock_{self.name}"
|
||||
self._file_path = Path(tempfile.gettempdir()) / f"jobdata_lock_{self.name}"
|
||||
self._init_redis()
|
||||
|
||||
def _init_redis(self) -> None:
|
||||
try:
|
||||
import redis # type: ignore
|
||||
import redis.asyncio as aioredis
|
||||
from app.settings.config import settings
|
||||
self._redis = redis.Redis(
|
||||
host=getattr(settings, "REDIS_HOST", None) or "",
|
||||
|
||||
host = getattr(settings, "REDIS_HOST", None) or ""
|
||||
if not host:
|
||||
return
|
||||
self._redis = aioredis.Redis(
|
||||
host=host,
|
||||
port=getattr(settings, "REDIS_PORT", 6379),
|
||||
db=getattr(settings, "REDIS_DB", 0),
|
||||
password=getattr(settings, "REDIS_PASS", None) or None,
|
||||
socket_timeout=3,
|
||||
)
|
||||
# 尝试 ping
|
||||
if self._redis.ping():
|
||||
self._use_redis = True
|
||||
self._use_redis = True
|
||||
except Exception:
|
||||
self._use_redis = False
|
||||
|
||||
async def _ping_redis(self) -> bool:
|
||||
if not self._redis:
|
||||
return False
|
||||
try:
|
||||
return bool(await self._redis.ping())
|
||||
except Exception:
|
||||
self._use_redis = False
|
||||
return False
|
||||
|
||||
async def acquire(self) -> bool:
|
||||
"""获取锁,返回是否成功"""
|
||||
if self._use_redis and self._redis is not None:
|
||||
try:
|
||||
# NX+EX 设置锁,避免竞争
|
||||
return bool(self._redis.set(f"lock:{self.name}", self.token, nx=True, ex=self.ttl))
|
||||
if not await self._ping_redis():
|
||||
return self._try_file_lock()
|
||||
return bool(await self._redis.set(
|
||||
f"lock:{self.name}", self.token, nx=True, ex=self.ttl
|
||||
))
|
||||
except Exception:
|
||||
pass
|
||||
# 文件锁降级(单机安全)
|
||||
return self._try_file_lock()
|
||||
|
||||
def _try_file_lock(self) -> bool:
|
||||
"""文件锁(带 TTL 过期检查),使用绝对路径"""
|
||||
lock_dir = self._file_path
|
||||
lock_meta = lock_dir / "meta"
|
||||
try:
|
||||
os.mkdir(self._file_path)
|
||||
lock_dir.mkdir()
|
||||
lock_meta.write_text(str(time.time()))
|
||||
return True
|
||||
except FileExistsError:
|
||||
if lock_meta.exists():
|
||||
try:
|
||||
created = float(lock_meta.read_text())
|
||||
if time.time() - created > self.ttl:
|
||||
logger.warning(
|
||||
f"Stale file lock detected for '{self.name}', "
|
||||
f"age={time.time() - created:.0f}s > ttl={self.ttl}s. Cleaning up."
|
||||
)
|
||||
shutil.rmtree(lock_dir, ignore_errors=True)
|
||||
try:
|
||||
lock_dir.mkdir()
|
||||
lock_meta.write_text(str(time.time()))
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
except (ValueError, OSError):
|
||||
pass
|
||||
return False
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
@ -49,15 +95,14 @@ class DistributedLock:
|
||||
"""释放锁"""
|
||||
if self._use_redis and self._redis is not None:
|
||||
try:
|
||||
# 简单释放;生产建议使用 Lua 脚本确保原子性
|
||||
key = f"lock:{self.name}"
|
||||
val = self._redis.get(key)
|
||||
val = await self._redis.get(key)
|
||||
if val and val.decode() == self.token:
|
||||
self._redis.delete(key)
|
||||
await self._redis.delete(key)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
os.rmdir(self._file_path)
|
||||
shutil.rmtree(self._file_path, ignore_errors=True)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
@ -72,4 +117,4 @@ class DistributedLock:
|
||||
yield False
|
||||
finally:
|
||||
if acquired:
|
||||
await self.release()
|
||||
await self.release()
|
||||
|
||||
@ -8,6 +8,7 @@ from pathlib import Path
|
||||
|
||||
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
||||
from apscheduler.triggers.cron import CronTrigger
|
||||
from tortoise.exceptions import OperationalError
|
||||
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from app.core.locks import DistributedLock
|
||||
@ -50,9 +51,6 @@ async def stats_job():
|
||||
("boss", "job", "boss_job"),
|
||||
("qcwy", "job", "qcwy_job"),
|
||||
("zhilian", "job", "zhilian_job"),
|
||||
("boss", "company", "boss_company"),
|
||||
("qcwy", "company", "qcwy_company"),
|
||||
("zhilian", "company", "zhilian_company"),
|
||||
]
|
||||
results: list[dict] = []
|
||||
for source, data_type, table in tables:
|
||||
@ -170,32 +168,46 @@ async def ecs_full_pipeline_job():
|
||||
await _record_task_run(task_id, task_name, "fail", started_at, error=str(e))
|
||||
|
||||
|
||||
async def _get_active_proxy() -> "str | None":
|
||||
"""从数据库读取可用代理配置,优先 platform='all'"""
|
||||
from app.models.cleaning import ProxyConfig
|
||||
|
||||
proxy_obj = await ProxyConfig.filter(is_active=True).order_by("platform").first()
|
||||
if proxy_obj:
|
||||
logger.info(f"company_cleaning_job using proxy: {proxy_obj.name} ({proxy_obj.proxy_url[:30]}...)")
|
||||
return proxy_obj.proxy_url
|
||||
return None
|
||||
|
||||
|
||||
async def company_cleaning_job():
|
||||
"""每5分钟执行:自动清洗待处理公司数据"""
|
||||
from app.services.company_cleaner import company_cleaner
|
||||
|
||||
|
||||
task_id = str(uuid.uuid4())
|
||||
started_at = datetime.now()
|
||||
task_name = "company_cleaning_job"
|
||||
# Use a shorter lock TTL since it runs frequently
|
||||
lock = DistributedLock(name=task_name, ttl_seconds=300)
|
||||
|
||||
|
||||
async with lock.context() as acquired:
|
||||
if not acquired:
|
||||
logger.info("company_cleaning_job skipped: lock not acquired")
|
||||
return
|
||||
|
||||
|
||||
try:
|
||||
logger.info("Running automated company cleaning job...")
|
||||
# 1. Collect new data (with 7-day rule)
|
||||
# 减少数量,确保在5分钟内完成
|
||||
await company_cleaner.collect_pending_companies(limit=50)
|
||||
|
||||
# 2. Process pending data with small delay to be polite
|
||||
|
||||
# 2. 从数据库读取代理配置
|
||||
proxy = await _get_active_proxy()
|
||||
|
||||
# 3. Process pending data with small delay to be polite
|
||||
# 减少数量,确保在5分钟内完成(30个公司,每个约3-5秒,加上延迟,总计约2-3分钟)
|
||||
# 这样留出时间给收集任务和其他操作
|
||||
await company_cleaner.process_pending_companies(limit=30, max_delay_seconds=1)
|
||||
|
||||
await company_cleaner.process_pending_companies(limit=30, max_delay_seconds=1, proxy=proxy)
|
||||
|
||||
duration = (datetime.now() - started_at).total_seconds()
|
||||
logger.info(f"company_cleaning_job completed in {duration:.2f} seconds")
|
||||
await _record_task_run(task_id, task_name, "success", started_at)
|
||||
@ -207,12 +219,12 @@ async def company_cleaning_job():
|
||||
async def daily_cleanup_job():
|
||||
"""每天 00:05 执行:清理已完成的任务记录"""
|
||||
from app.services.company_cleaner import company_cleaner
|
||||
|
||||
|
||||
task_id = str(uuid.uuid4())
|
||||
started_at = datetime.now()
|
||||
task_name = "daily_cleanup_job"
|
||||
lock = DistributedLock(name=task_name, ttl_seconds=3600)
|
||||
|
||||
|
||||
async with lock.context() as acquired:
|
||||
if not acquired:
|
||||
return
|
||||
@ -226,6 +238,34 @@ async def daily_cleanup_job():
|
||||
await _record_task_run(task_id, task_name, "fail", started_at, error=str(e))
|
||||
|
||||
|
||||
async def stale_crawl_cleanup_job():
    """Every 10 minutes: downgrade keywords stuck in ``crawling`` for over 30 minutes to ``partial``.

    Runs under the ``stale_crawl_cleanup`` distributed lock (TTL 300s) and does
    nothing when the lock is not acquired. An ``OperationalError`` whose text
    mentions ``crawl_status`` or ``crawl_started_at`` is logged as a warning
    and swallowed; any other ``OperationalError`` is re-raised.
    """
    from app.models.keyword import BossKeyword, QcwyKeyword, ZhilianKeyword

    job_lock = DistributedLock(name="stale_crawl_cleanup", ttl_seconds=300)

    async with job_lock.context() as got_lock:
        if got_lock:
            cutoff = datetime.now() - timedelta(minutes=30)
            try:
                for keyword_model in (BossKeyword, QcwyKeyword, ZhilianKeyword):
                    stale_rows = keyword_model.filter(
                        crawl_status="crawling",
                        crawl_started_at__lt=cutoff,
                    )
                    updated = await stale_rows.update(crawl_status="partial")
                    if updated:
                        logger.info(f"{keyword_model.__name__}: {updated} stale crawl tasks marked as partial")
            except OperationalError as exc:
                message = str(exc)
                mentions_crawl_column = any(
                    column in message for column in ("crawl_status", "crawl_started_at")
                )
                if not mentions_crawl_column:
                    raise
                # The keyword tables apparently lack the crawl columns; skip this run.
                logger.warning(f"stale_crawl_cleanup skipped due to missing keyword crawl columns: {message}")
|
||||
|
||||
|
||||
async def _post_with_retry(body: str):
|
||||
"""带失败重试的统计结果上报"""
|
||||
import httpx
|
||||
@ -270,7 +310,7 @@ def _build_email_html(subject: str, payload: dict) -> str:
|
||||
return f"<html><head><meta charset='utf-8'><style>{style}</style></head><body>{html_head}{table}</body></html>"
|
||||
if "anomalies" in payload:
|
||||
rows = "".join(
|
||||
f"<tr><td>{a.get('source')}</td><td>{a.get('ip')}</td><td>{a.get('date')}</td></tr>" for a in payload.get("anomalies", [])
|
||||
f"<tr><td>{a.get('source')}</td><td>{a.get('ip')}</td><td>{a.get('last_report_at', 'N/A')}</td></tr>" for a in payload.get("anomalies", [])
|
||||
)
|
||||
table = f"<table><thead><tr><th>来源</th><th>IP</th><th>日期</th></tr></thead><tbody>{rows}</tbody></table>"
|
||||
return f"<html><head><meta charset='utf-8'><style>{style}</style></head><body>{html_head}{table}</body></html>"
|
||||
@ -326,8 +366,8 @@ def start_scheduler():
|
||||
)
|
||||
# 每6小时触发
|
||||
scheduler.add_job(stats_job, CronTrigger(second=0, minute=0, hour="*/6"), id="stats_job", replace_existing=True)
|
||||
# 每6小时触发:执行 ECS 全流程
|
||||
scheduler.add_job(ecs_full_pipeline_job, CronTrigger(second=0, minute=0, hour="*/6"), id="ecs_full_pipeline", replace_existing=True)
|
||||
# 每6小时触发:执行 ECS 全流程(偏移30分钟,避免与 stats_job 同时执行)
|
||||
scheduler.add_job(ecs_full_pipeline_job, CronTrigger(second=0, minute=30, hour="*/6"), id="ecs_full_pipeline", replace_existing=True)
|
||||
# 每10分钟触发告警
|
||||
scheduler.add_job(ip_alert_job, CronTrigger(second=0, minute="*/10"), id="ip_alert_job", replace_existing=True)
|
||||
# 每5分钟执行:自动清洗
|
||||
@ -341,6 +381,8 @@ def start_scheduler():
|
||||
)
|
||||
# 每天 00:05 执行:清理历史记录
|
||||
scheduler.add_job(daily_cleanup_job, CronTrigger(second=0, minute=5, hour=0), id="daily_cleanup_job", replace_existing=True)
|
||||
# 每10分钟执行:检测僵死爬取任务并降级为partial
|
||||
scheduler.add_job(stale_crawl_cleanup_job, CronTrigger(second=0, minute="*/10"), id="stale_crawl_cleanup", replace_existing=True)
|
||||
scheduler.start()
|
||||
|
||||
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
# 新增model需要在这里导入
|
||||
from .admin import *
|
||||
from .company import *
|
||||
from .metrics import *
|
||||
from .keyword import *
|
||||
from .cleaning import *
|
||||
|
||||
58
app/models/company.py
Normal file
58
app/models/company.py
Normal file
@ -0,0 +1,58 @@
|
||||
from tortoise import fields
|
||||
|
||||
from .base import BaseModel, TimestampMixin
|
||||
|
||||
|
||||
class BaseCompanyModel(BaseModel, TimestampMixin):
    """Abstract base declaring the columns shared by the per-platform company tables."""

    # Identity
    source_company_id = fields.CharField(
        max_length=128, unique=True, index=True, description="来源站点公司ID"
    )
    company_name = fields.CharField(max_length=255, index=True, description="公司名称")

    # Profile attributes (all optional)
    company_type = fields.CharField(max_length=100, null=True, description="公司性质")
    industry = fields.CharField(max_length=255, null=True, description="行业")
    company_size = fields.CharField(max_length=100, null=True, description="公司规模")
    financing_stage = fields.CharField(max_length=100, null=True, description="融资阶段")
    city = fields.CharField(max_length=100, null=True, description="城市")
    address = fields.TextField(null=True, description="地址")
    website = fields.CharField(max_length=500, null=True, description="官网")
    logo_url = fields.CharField(max_length=1000, null=True, description="Logo地址")
    description = fields.TextField(null=True, description="公司简介")

    # Raw payload and crawl timestamps (required)
    raw_json = fields.JSONField(description="原始公司JSON")
    first_crawled_at = fields.DatetimeField(index=True, description="首次抓取时间")
    last_crawled_at = fields.DatetimeField(index=True, description="最后抓取时间")

    class Meta:
        abstract = True
|
||||
|
||||
|
||||
class BossCompany(BaseCompanyModel):
    """Company model mapped to the ``boss_company`` table."""

    class Meta:
        table = "boss_company"
|
||||
|
||||
|
||||
class QcwyCompany(BaseCompanyModel):
    """Company model mapped to the ``qcwy_company`` table."""

    class Meta:
        table = "qcwy_company"
|
||||
|
||||
|
||||
class ZhilianCompany(BaseCompanyModel):
    """Company model mapped to the ``zhilian_company`` table."""

    class Meta:
        table = "zhilian_company"
|
||||
|
||||
|
||||
class CompanyCleaningQueue(BaseModel, TimestampMixin):
    """Queue row for one company to be cleaned; unique per (source, company_id)."""

    # Which company
    source = fields.CharField(max_length=20, index=True, description="来源平台")
    company_id = fields.CharField(max_length=128, index=True, description="来源站点公司ID")
    company_name = fields.CharField(max_length=255, null=True, description="公司名称")

    # Processing state
    status = fields.CharField(
        max_length=20, default="pending", index=True, description="状态"
    )
    error_msg = fields.TextField(null=True, description="错误信息")
    retry_count = fields.IntField(default=0, description="重试次数")
    started_at = fields.DatetimeField(null=True, description="开始处理时间")
    finished_at = fields.DatetimeField(null=True, description="处理结束时间")

    # Job-sync counters
    jobs_fetched = fields.IntField(default=0, description="抓取到的职位数")
    jobs_stored = fields.IntField(default=0, description="写入ClickHouse成功数")
    jobs_duplicate = fields.IntField(default=0, description="职位重复数")
    jobs_failed = fields.IntField(default=0, description="职位写入失败数")
    jobs_error_msg = fields.TextField(null=True, description="职位同步错误信息")

    class Meta:
        table = "company_cleaning_queue"
        unique_together = (("source", "company_id"),)
|
||||
@ -8,6 +8,17 @@ class BaseKeyword(Model):
|
||||
job = fields.CharField(max_length=128)
|
||||
last_requested_date = fields.DateField(null=True)
|
||||
last_requested_at = fields.DatetimeField(null=True)
|
||||
|
||||
# 爬取状态管理
|
||||
crawl_status = fields.CharField(max_length=16, default="idle")
|
||||
last_completed_page = fields.IntField(default=0)
|
||||
total_pages = fields.IntField(default=0)
|
||||
jobs_found = fields.IntField(default=0)
|
||||
crawl_started_at = fields.DatetimeField(null=True)
|
||||
crawler_id = fields.CharField(max_length=64, default="")
|
||||
error_message = fields.TextField(default="")
|
||||
retry_count = fields.IntField(default=0)
|
||||
|
||||
created_at = fields.DatetimeField(auto_now_add=True)
|
||||
updated_at = fields.DatetimeField(auto_now=True)
|
||||
|
||||
|
||||
@ -1,5 +1,3 @@
|
||||
import math
|
||||
from collections.abc import Generator
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, Any, List
|
||||
from clickhouse_connect.driver import AsyncClient
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, Any, List
|
||||
from typing import Optional, Dict, List
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
@ -12,6 +12,7 @@ class AnalyticsQueryParams(BaseModel):
|
||||
position_name: Optional[str] = Field(None, description="职位名称筛选")
|
||||
industry: Optional[str] = Field(None, description="行业筛选")
|
||||
experience_required: Optional[str] = Field(None, description="经验要求筛选")
|
||||
channel: Optional[str] = Field(None, description="渠道筛选 (mini/web/app)")
|
||||
limit: int = Field(10, ge=1, le=100, description="返回结果数量限制")
|
||||
|
||||
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
from datetime import date, datetime
|
||||
from typing import Optional
|
||||
from typing import Literal, Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
@ -22,8 +22,28 @@ class KeywordOut(KeywordBase):
|
||||
id: int
|
||||
last_requested_date: Optional[date] = None
|
||||
last_requested_at: Optional[datetime] = None
|
||||
crawl_status: str = "idle"
|
||||
last_completed_page: int = 0
|
||||
total_pages: int = 0
|
||||
jobs_found: int = 0
|
||||
retry_count: int = 0
|
||||
created_at: datetime
|
||||
updated_at: datetime
|
||||
|
||||
class Config:
|
||||
from_attributes = True
|
||||
|
||||
|
||||
class PageProgressRequest(BaseModel):
    """Request payload carrying page-level crawl progress for one keyword.

    NOTE(review): the endpoint consuming this schema is not visible here.
    """

    # Platform name; must be exactly boss, qcwy or zhilian.
    source: str = Field(..., pattern="^(boss|qcwy|zhilian)$")
    keyword_id: int
    # Page number, 1-based (must be >= 1).
    page: int = Field(..., ge=1)
    # Both counters default to 0 and must be non-negative.
    total_pages: int = Field(0, ge=0)
    jobs_found: int = Field(0, ge=0)
|
||||
|
||||
|
||||
class CrawlCompleteRequest(BaseModel):
    """Request payload reporting the final outcome of a keyword crawl.

    NOTE(review): the endpoint consuming this schema is not visible here.
    """

    # Platform name; must be exactly boss, qcwy or zhilian.
    source: str = Field(..., pattern="^(boss|qcwy|zhilian)$")
    keyword_id: int
    # Only these two outcomes are accepted.
    status: Literal["completed", "failed"]
    # Optional; defaults to an empty string.
    error_message: str = ""
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
from pydantic import BaseModel, Field
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any
|
||||
from typing import Optional, List
|
||||
|
||||
|
||||
class BossTokenCreate(BaseModel):
|
||||
|
||||
@ -1,32 +1,44 @@
|
||||
import asyncio
|
||||
import csv
|
||||
import io
|
||||
import re
|
||||
import time
|
||||
from typing import List, Dict, Any, Union, Optional
|
||||
from fastapi import UploadFile
|
||||
from loguru import logger
|
||||
from app.services.crawler.boss import BossService
|
||||
from app.services.crawler.qcwy import QcwyService
|
||||
from app.services.crawler.zhilian import ZhilianService
|
||||
from app.services.job import DataRouterService, DataType, PlatformType
|
||||
from app.services.company_jobs_sync import CompanyJobsSyncService
|
||||
from app.services.company_storage import company_storage
|
||||
from app.services.ingest import IngestService
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from app.models.token import BossToken
|
||||
from jobs_spider.qcwy.search_company_jobs import _extract_items as qcwy_extract_items
|
||||
|
||||
|
||||
class CleaningService:
|
||||
_TOKEN_REFRESH_INTERVAL = 3600 # 1小时刷新一次
|
||||
|
||||
def __init__(self):
|
||||
self.boss_service = BossService()
|
||||
self.qcwy_service = QcwyService()
|
||||
self.zhilian_service = ZhilianService()
|
||||
self.data_router = None
|
||||
self.company_jobs_sync = CompanyJobsSyncService()
|
||||
self.data_router: Optional[IngestService] = None
|
||||
self._boss_token_loaded = False
|
||||
self._token_loaded_at: float = 0
|
||||
|
||||
def _apply_proxy(self, proxy: Optional[str]) -> None:
|
||||
self.boss_service.set_proxy(proxy)
|
||||
self.qcwy_service.set_proxy(proxy)
|
||||
self.zhilian_service.set_proxy(proxy)
|
||||
self.company_jobs_sync.set_proxy(proxy)
|
||||
|
||||
async def _ensure_boss_token_loaded(self) -> None:
|
||||
if self._boss_token_loaded and self.boss_service.login_data.get("mpt"):
|
||||
now = time.time()
|
||||
if (self._boss_token_loaded
|
||||
and self.boss_service.login_data.get("mpt")
|
||||
and now - self._token_loaded_at < self._TOKEN_REFRESH_INTERVAL):
|
||||
return
|
||||
token_obj = await BossToken.filter(is_active=True).order_by("-updated_at").first()
|
||||
if not token_obj:
|
||||
@ -34,21 +46,21 @@ class CleaningService:
|
||||
return
|
||||
self.boss_service.set_login_data(token_obj.mpt or "", "")
|
||||
self._boss_token_loaded = True
|
||||
self._token_loaded_at = now
|
||||
|
||||
async def get_data_router(self) -> DataRouterService:
|
||||
async def get_data_router(self) -> IngestService:
|
||||
if not self.data_router:
|
||||
client = await clickhouse_manager.get_client()
|
||||
self.data_router = DataRouterService(client)
|
||||
self.data_router = IngestService(client)
|
||||
return self.data_router
|
||||
|
||||
async def parse_file(self, file: UploadFile) -> List[str]:
|
||||
content = await file.read()
|
||||
filename = file.filename
|
||||
|
||||
|
||||
targets = []
|
||||
if filename.endswith('.csv'):
|
||||
text = content.decode('utf-8')
|
||||
# Handle BOM
|
||||
if text.startswith('\uFEFF'):
|
||||
text = text[1:]
|
||||
reader = csv.reader(io.StringIO(text))
|
||||
@ -58,9 +70,22 @@ class CleaningService:
|
||||
else:
|
||||
text = content.decode('utf-8')
|
||||
targets = [line.strip() for line in text.splitlines() if line.strip()]
|
||||
|
||||
|
||||
return [t for t in targets if t]
|
||||
|
||||
async def _store_company_record(
|
||||
self,
|
||||
source: str,
|
||||
data: Dict[str, Any],
|
||||
company_id: str,
|
||||
) -> Dict[str, Any]:
|
||||
result = await company_storage.upsert_company(source, data, company_id=company_id)
|
||||
result["duplicate"] = False
|
||||
result["remote_sent"] = False
|
||||
result["message"] = "公司数据已写入MySQL"
|
||||
result["original_data"] = data
|
||||
return result
|
||||
|
||||
async def process_single_item(self, target: str, clean_type: str = "auto", platform: str = "auto", proxy: Optional[str] = None) -> Dict[str, Any]:
|
||||
try:
|
||||
await self._ensure_boss_token_loaded()
|
||||
@ -90,7 +115,7 @@ class CleaningService:
|
||||
result = await self.clean_qcwy_company_jobs(target)
|
||||
elif platform == "zhilian":
|
||||
result = await self.clean_zhilian_company_jobs(target)
|
||||
|
||||
|
||||
if not result:
|
||||
return {
|
||||
"success": False,
|
||||
@ -99,9 +124,7 @@ class CleaningService:
|
||||
"storage_status": "failed",
|
||||
"remote_sent": False
|
||||
}
|
||||
|
||||
# Normalize result if it's just a dict (from store_data)
|
||||
# If it's a boolean (from some legacy paths), wrap it
|
||||
|
||||
if isinstance(result, bool):
|
||||
return {
|
||||
"success": result,
|
||||
@ -110,15 +133,15 @@ class CleaningService:
|
||||
"storage_status": "unknown",
|
||||
"remote_sent": False
|
||||
}
|
||||
|
||||
# If it's the dict returned by DataRouterService.store_data
|
||||
|
||||
return {
|
||||
"success": result.get("success", False),
|
||||
"target": target,
|
||||
"error": result.get("message") if not result.get("success") else None,
|
||||
"storage_status": "duplicate" if result.get("duplicate") else "saved",
|
||||
"remote_sent": result.get("remote_sent", False),
|
||||
"data_summary": result.get("data_summary"), # Optional: summary of data
|
||||
"data_summary": result.get("data_summary"),
|
||||
"jobs_summary": result.get("jobs_summary"),
|
||||
"original_data": result.get("original_data")
|
||||
}
|
||||
|
||||
@ -145,8 +168,7 @@ class CleaningService:
|
||||
router = await self.get_data_router()
|
||||
data = None
|
||||
result = None
|
||||
|
||||
# Try to extract ID if target looks like a URL
|
||||
|
||||
if platform == "boss":
|
||||
match = re.search(r'job_detail/([^.]+)\.html', target)
|
||||
if match:
|
||||
@ -159,55 +181,50 @@ class CleaningService:
|
||||
match = re.search(r'jobs\.zhaopin\.com/(\w+)\.htm', target)
|
||||
if match:
|
||||
target = match.group(1)
|
||||
|
||||
|
||||
if platform == "boss":
|
||||
data = self.boss_service.get_job_detail_by_id(target)
|
||||
data = await asyncio.to_thread(self.boss_service.get_job_detail_by_id, target)
|
||||
if data:
|
||||
result = await router.store_data(data, DataType.JOB, PlatformType.BOSS)
|
||||
result = await router.store_single("boss", "mini", "job", data)
|
||||
elif platform == "qcwy":
|
||||
data = self.qcwy_service.get_job_detail(target)
|
||||
data = await asyncio.to_thread(self.qcwy_service.get_job_detail, target)
|
||||
if data:
|
||||
result = await router.store_data(data, DataType.JOB, PlatformType.QCWY)
|
||||
result = await router.store_single("qcwy", "mini", "job", data)
|
||||
elif platform == "zhilian":
|
||||
data = self.zhilian_service.get_job_detail(target)
|
||||
data = await asyncio.to_thread(self.zhilian_service.get_job_detail, target)
|
||||
if data:
|
||||
result = await router.store_data(data, DataType.JOB, PlatformType.ZHILIAN)
|
||||
|
||||
result = await router.store_single("zhilian", "mini", "job", data)
|
||||
|
||||
if result and isinstance(result, dict) and data:
|
||||
result['original_data'] = data
|
||||
return result
|
||||
|
||||
|
||||
return False
|
||||
|
||||
async def clean_by_company_name(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
|
||||
router = await self.get_data_router()
|
||||
if platform == "boss":
|
||||
res = self.boss_service.search_jobs(target)
|
||||
res = await asyncio.to_thread(self.boss_service.search_jobs, target)
|
||||
if res and res.get('zpData') and res['zpData'].get('list'):
|
||||
# For company name search, we might get multiple jobs.
|
||||
# Currently we just return the result of the LAST one for simplicity in status reporting,
|
||||
# or we should change logic to handle list.
|
||||
# For now, let's just process them and return the last result as indicative.
|
||||
last_result = None
|
||||
for job in res['zpData']['list']:
|
||||
last_result = await router.store_data(job, DataType.JOB, PlatformType.BOSS)
|
||||
|
||||
last_result = await router.store_single("boss", "mini", "job", job)
|
||||
|
||||
if last_result and isinstance(last_result, dict):
|
||||
# For search results, we store the full search response as original data
|
||||
last_result['original_data'] = res
|
||||
return last_result if last_result else False
|
||||
elif platform == "qcwy":
|
||||
res = self.qcwy_service.search_jobs(target)
|
||||
res = await asyncio.to_thread(self.qcwy_service.search_jobs, target)
|
||||
if res:
|
||||
last_result = None
|
||||
for job in res:
|
||||
last_result = await router.store_data(job, DataType.JOB, PlatformType.QCWY)
|
||||
|
||||
last_result = await router.store_single("qcwy", "mini", "job", job)
|
||||
|
||||
if last_result and isinstance(last_result, dict):
|
||||
last_result['original_data'] = res
|
||||
return last_result if last_result else False
|
||||
elif platform == "zhilian":
|
||||
res = self.zhilian_service.search_company_jobs_by_name(target)
|
||||
res = await asyncio.to_thread(self.zhilian_service.search_company_jobs_by_name, target)
|
||||
if res and isinstance(res, dict):
|
||||
data = res.get("data") or {}
|
||||
items = data.get("list") or []
|
||||
@ -215,148 +232,95 @@ class CleaningService:
|
||||
items = []
|
||||
last_result = None
|
||||
for job in items:
|
||||
last_result = await router.store_data(job, DataType.JOB, PlatformType.ZHILIAN)
|
||||
last_result = await router.store_single("zhilian", "mini", "job", job)
|
||||
if last_result and isinstance(last_result, dict):
|
||||
last_result["original_data"] = res
|
||||
return last_result if last_result else False
|
||||
return False
|
||||
|
||||
async def clean_by_company_id(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
|
||||
router = await self.get_data_router()
|
||||
data = None
|
||||
result = None
|
||||
|
||||
|
||||
if platform == "boss":
|
||||
data = self.boss_service.get_company_detail_by_id(target)
|
||||
data = await asyncio.to_thread(self.boss_service.get_company_detail_by_id, target)
|
||||
if data:
|
||||
result = await router.store_data(data, DataType.COMPANY, PlatformType.BOSS)
|
||||
result = await self._store_company_record("boss", data, target)
|
||||
result["jobs_summary"] = await self.company_jobs_sync.sync_company_jobs("boss", target)
|
||||
elif platform == "qcwy":
|
||||
company_id = target
|
||||
match = re.match(r"^co(\d+)$", company_id)
|
||||
if match:
|
||||
company_id = match.group(1)
|
||||
data = self.qcwy_service.get_company_info(company_id)
|
||||
data = await asyncio.to_thread(self.qcwy_service.get_company_info, company_id)
|
||||
if data:
|
||||
result = await router.store_data(data, DataType.COMPANY, PlatformType.QCWY)
|
||||
result = await self._store_company_record("qcwy", data, company_id)
|
||||
result["jobs_summary"] = await self.company_jobs_sync.sync_company_jobs("qcwy", company_id)
|
||||
elif platform == "zhilian":
|
||||
data = self.zhilian_service.get_company_detail(target)
|
||||
data = await asyncio.to_thread(self.zhilian_service.get_company_detail, target)
|
||||
if data:
|
||||
result = await router.store_data(data, DataType.COMPANY, PlatformType.ZHILIAN)
|
||||
|
||||
result = await self._store_company_record("zhilian", data, target)
|
||||
result["jobs_summary"] = await self.company_jobs_sync.sync_company_jobs("zhilian", target)
|
||||
|
||||
if result and isinstance(result, dict) and data:
|
||||
result['original_data'] = data
|
||||
return result
|
||||
|
||||
|
||||
return False
|
||||
|
||||
async def clean_boss_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
|
||||
router = await self.get_data_router()
|
||||
company_id = target
|
||||
|
||||
match = re.search(r'gongsi/([^.]+)\.html', target)
|
||||
if match:
|
||||
company_id = match.group(1)
|
||||
|
||||
data = self.boss_service.get_company_jobs_by_id(company_id)
|
||||
if not data:
|
||||
return False
|
||||
|
||||
jobs = []
|
||||
zp_data = data.get("zpData") if isinstance(data, dict) else None
|
||||
if isinstance(zp_data, dict):
|
||||
if isinstance(zp_data.get("jobList"), list):
|
||||
jobs = zp_data.get("jobList") or []
|
||||
elif isinstance(zp_data.get("list"), list):
|
||||
jobs = zp_data.get("list") or []
|
||||
|
||||
if not jobs:
|
||||
return False
|
||||
|
||||
last_result: Optional[Dict[str, Any]] = None
|
||||
for job in jobs:
|
||||
last_result = await router.store_data(job, DataType.JOB, PlatformType.BOSS)
|
||||
|
||||
if last_result and isinstance(last_result, dict):
|
||||
last_result["original_data"] = data
|
||||
return last_result
|
||||
|
||||
return False
|
||||
result = await self.company_jobs_sync.sync_company_jobs("boss", company_id)
|
||||
return result if result.get("jobs_fetched", 0) > 0 else False
|
||||
|
||||
async def clean_qcwy_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
|
||||
router = await self.get_data_router()
|
||||
company_id = target
|
||||
|
||||
match = re.match(r'^co(\d+)$', company_id)
|
||||
if match:
|
||||
company_id = match.group(1)
|
||||
|
||||
data = self.qcwy_service.get_company_jobs_by_id(company_id)
|
||||
if not data:
|
||||
return False
|
||||
|
||||
jobs_list = qcwy_extract_items(data)
|
||||
jobs: List[Dict[str, Any]] = jobs_list if isinstance(jobs_list, list) else []
|
||||
|
||||
if not jobs:
|
||||
return False
|
||||
|
||||
last_result: Optional[Dict[str, Any]] = None
|
||||
for job in jobs:
|
||||
last_result = await router.store_data(job, DataType.JOB, PlatformType.QCWY)
|
||||
|
||||
if last_result and isinstance(last_result, dict):
|
||||
last_result["original_data"] = data
|
||||
return last_result
|
||||
|
||||
return False
|
||||
result = await self.company_jobs_sync.sync_company_jobs("qcwy", company_id)
|
||||
return result if result.get("jobs_fetched", 0) > 0 else False
|
||||
|
||||
async def clean_zhilian_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
|
||||
router = await self.get_data_router()
|
||||
company_id = target
|
||||
|
||||
data = self.zhilian_service.get_company_jobs_by_id(company_id)
|
||||
if not data or not isinstance(data, dict):
|
||||
return False
|
||||
|
||||
data_field = data.get("data") or {}
|
||||
jobs = data_field.get("list") or []
|
||||
if not isinstance(jobs, list) or not jobs:
|
||||
return False
|
||||
|
||||
last_result: Optional[Dict[str, Any]] = None
|
||||
for job in jobs:
|
||||
last_result = await router.store_data(job, DataType.JOB, PlatformType.ZHILIAN)
|
||||
|
||||
if last_result and isinstance(last_result, dict):
|
||||
last_result["original_data"] = data
|
||||
return last_result
|
||||
|
||||
return False
|
||||
result = await self.company_jobs_sync.sync_company_jobs("zhilian", company_id)
|
||||
return result if result.get("jobs_fetched", 0) > 0 else False
|
||||
|
||||
async def _process_boss_url(self, url: str) -> Union[bool, Dict[str, Any]]:
|
||||
job_match = re.search(r'job_detail/([^.]+)\.html', url)
|
||||
if job_match:
|
||||
return await self.clean_by_job_id(job_match.group(1), "boss")
|
||||
|
||||
|
||||
company_match = re.search(r'gongsi/([^.]+)\.html', url)
|
||||
if company_match:
|
||||
return await self.clean_by_company_id(company_match.group(1), "boss")
|
||||
|
||||
# Fallback: assume it's a job ID
|
||||
|
||||
return await self.clean_by_job_id(url, "boss")
|
||||
|
||||
async def _process_qcwy_url(self, url: str) -> Union[bool, Dict[str, Any]]:
|
||||
job_match = re.search(r'/(\d+)\.html', url)
|
||||
if job_match:
|
||||
return await self.clean_by_job_id(job_match.group(1), "qcwy")
|
||||
# Fallback: assume it's a job ID
|
||||
company_match = re.search(r'co(\d+)', url, re.IGNORECASE)
|
||||
if company_match:
|
||||
return await self.clean_by_company_id(company_match.group(1), "qcwy")
|
||||
return await self.clean_by_job_id(url, "qcwy")
|
||||
|
||||
async def _process_zhilian_url(self, url: str) -> Union[bool, Dict[str, Any]]:
|
||||
job_match = re.search(r'jobs\.zhaopin\.com/(\w+)\.htm', url)
|
||||
if job_match:
|
||||
return await self.clean_by_job_id(job_match.group(1), "zhilian")
|
||||
# Fallback: assume it's a job ID
|
||||
company_match = re.search(r'/company/([A-Za-z0-9]+)', url)
|
||||
if company_match:
|
||||
return await self.clean_by_company_id(company_match.group(1), "zhilian")
|
||||
return await self.clean_by_job_id(url, "zhilian")
|
||||
|
||||
async def _process_search_company(self, name: str) -> Union[bool, Dict[str, Any]]:
|
||||
|
||||
@ -1,32 +1,69 @@
|
||||
import asyncio
|
||||
import json
|
||||
import random
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from app.models.company import CompanyCleaningQueue
|
||||
from app.models.token import BossToken
|
||||
from app.services.company_jobs_sync import CompanyJobsSyncService
|
||||
from app.services.company_storage import company_storage, normalize_company_id
|
||||
from app.services.crawler.boss import BossService
|
||||
from app.services.crawler.qcwy import QcwyService
|
||||
from app.services.crawler.zhilian import ZhilianService
|
||||
|
||||
|
||||
SOURCE_CONFIGS: dict[str, dict[str, Any]] = {
|
||||
"zhilian": {
|
||||
"job_table": "zhilian_job",
|
||||
"company_id_expr": "JSONExtractString(json_data, 'companyNumber')",
|
||||
"company_name_expr": "JSONExtractString(json_data, 'companyName')",
|
||||
"days_back": 30,
|
||||
"max_query_limit": None,
|
||||
},
|
||||
"qcwy": {
|
||||
"job_table": "qcwy_job",
|
||||
"company_id_expr": "JSONExtractString(json_data, 'coId')",
|
||||
"company_name_expr": "JSONExtractString(json_data, 'companyName')",
|
||||
"days_back": 30,
|
||||
"max_query_limit": 5000,
|
||||
},
|
||||
"boss": {
|
||||
"job_table": "boss_job",
|
||||
"company_id_expr": "JSONExtractString(json_data, 'brandComInfoVO', 'encryptBrandId')",
|
||||
"company_name_expr": "JSONExtractString(json_data, 'brandComInfoVO', 'brandName')",
|
||||
"days_back": 30,
|
||||
"max_query_limit": None,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
class CompanyCleaner:
|
||||
_TOKEN_REFRESH_INTERVAL = 3600
|
||||
|
||||
def __init__(self):
|
||||
self.boss_service = BossService()
|
||||
self.qcwy_service = QcwyService()
|
||||
self.zhilian_service = ZhilianService()
|
||||
self.company_jobs_sync = CompanyJobsSyncService()
|
||||
self._boss_token_loaded = False
|
||||
self._token_loaded_at: float = 0
|
||||
|
||||
def _apply_proxy(self, proxy: Optional[str]) -> None:
|
||||
self.boss_service.set_proxy(proxy)
|
||||
self.qcwy_service.set_proxy(proxy)
|
||||
self.zhilian_service.set_proxy(proxy)
|
||||
self.company_jobs_sync.set_proxy(proxy)
|
||||
|
||||
async def _ensure_boss_token_loaded(self) -> None:
|
||||
if self._boss_token_loaded and self.boss_service.login_data.get("mpt"):
|
||||
now = time.time()
|
||||
if (
|
||||
self._boss_token_loaded
|
||||
and self.boss_service.login_data.get("mpt")
|
||||
and now - self._token_loaded_at < self._TOKEN_REFRESH_INTERVAL
|
||||
):
|
||||
return
|
||||
token_obj = await BossToken.filter(is_active=True).order_by("-updated_at").first()
|
||||
if not token_obj:
|
||||
@ -34,420 +71,138 @@ class CompanyCleaner:
|
||||
return
|
||||
self.boss_service.set_login_data(token_obj.mpt or "", "")
|
||||
self._boss_token_loaded = True
|
||||
self._token_loaded_at = now
|
||||
|
||||
async def collect_pending_companies(self, limit: int = 1000, source: Optional[str] = None):
|
||||
async def collect_pending_companies(self, limit: int = 1000, source: Optional[str] = None) -> dict[str, Any]:
|
||||
client = await clickhouse_manager.get_client()
|
||||
logger.info(f"Starting to collect pending companies (limit={limit}, source={source or 'all'})...")
|
||||
if source is None or source == "zhilian":
|
||||
await self._collect_zhilian(client, limit)
|
||||
if source is None or source == "qcwy":
|
||||
await self._collect_qcwy(client, limit)
|
||||
if source is None or source == "boss":
|
||||
await self._collect_boss(client, limit)
|
||||
summary: dict[str, Any] = {
|
||||
"total_created": 0,
|
||||
"sources": {},
|
||||
}
|
||||
sources = [s for s in SOURCE_CONFIGS if source is None or source == s]
|
||||
# 并行采集各平台
|
||||
tasks = [self._collect_source(client, s, limit) for s in sources]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
for s, result in zip(sources, results):
|
||||
if isinstance(result, Exception):
|
||||
logger.error(f"Error collecting {s}: {result}")
|
||||
summary["sources"][s] = {"source": s, "created_count": 0, "error": str(result)}
|
||||
else:
|
||||
summary["sources"][s] = result
|
||||
summary["total_created"] += result["created_count"]
|
||||
logger.info("Finished collecting pending companies.")
|
||||
return summary
|
||||
|
||||
async def _collect_zhilian(self, client, limit: int):
|
||||
logger.info("Collecting Zhilian companies...")
|
||||
# 优化:先获取已存在的公司ID,避免在子查询中读取json_data
|
||||
# 使用PREWHERE提前过滤时间范围,减少需要读取的数据量
|
||||
# 检查90天内已处理的公司,避免重复请求
|
||||
days_back_existing = 90 # 查询最近90天的数据,避免重复请求已处理过的公司
|
||||
|
||||
existing_companies_query = f"""
|
||||
SELECT DISTINCT JSONExtractString(json_data, 'companyNumber') as cid
|
||||
FROM job_data.zhilian_company
|
||||
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
|
||||
AND json_data != ''
|
||||
WHERE JSONExtractString(json_data, 'companyNumber') != ''
|
||||
LIMIT 50000
|
||||
"""
|
||||
|
||||
# 添加重试机制
|
||||
existing_result = None
|
||||
existing_cids = set() # 默认使用空集合
|
||||
for attempt in range(3):
|
||||
try:
|
||||
logger.info(f"Querying existing Zhilian companies (attempt {attempt+1})...")
|
||||
existing_result = await client.query(existing_companies_query)
|
||||
existing_cids = {row[0] for row in existing_result.result_rows if row[0]}
|
||||
break
|
||||
except Exception as e:
|
||||
error_str = str(e).lower()
|
||||
if "memory" in error_str or "memory_limit" in error_str:
|
||||
if attempt == 0:
|
||||
days_back_existing = 1
|
||||
existing_companies_query = f"""
|
||||
SELECT DISTINCT JSONExtractString(json_data, 'companyNumber') as cid
|
||||
FROM job_data.zhilian_company
|
||||
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
|
||||
AND json_data != ''
|
||||
WHERE JSONExtractString(json_data, 'companyNumber') != ''
|
||||
LIMIT 5000
|
||||
"""
|
||||
logger.warning(f"Memory error, reducing time range to {days_back_existing} days")
|
||||
elif attempt == 1:
|
||||
existing_companies_query = f"""
|
||||
SELECT DISTINCT JSONExtractString(json_data, 'companyNumber') as cid
|
||||
FROM job_data.zhilian_company SAMPLE 0.1
|
||||
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
|
||||
AND json_data != ''
|
||||
WHERE JSONExtractString(json_data, 'companyNumber') != ''
|
||||
LIMIT 2000
|
||||
"""
|
||||
logger.warning(f"Memory error persists, using SAMPLE 0.1")
|
||||
else:
|
||||
logger.error(f"Failed to query existing companies after {attempt+1} attempts: {e}")
|
||||
logger.warning("Using empty set for existing_cids, continuing with collection...")
|
||||
existing_cids = set()
|
||||
break
|
||||
else:
|
||||
logger.error(f"Non-memory error while querying existing companies: {e}")
|
||||
raise
|
||||
|
||||
pending_query = "SELECT DISTINCT company_id FROM job_data.pending_company WHERE source = 'zhilian'"
|
||||
pending_result = await client.query(pending_query)
|
||||
pending_cids = {row[0] for row in pending_result.result_rows if row[0]}
|
||||
|
||||
# 构建排除列表
|
||||
exclude_cids = existing_cids | pending_cids
|
||||
|
||||
# 优化:添加时间范围过滤,只查询最近30天的数据,减少扫描量
|
||||
# 使用 PREWHERE 提前过滤时间范围,避免读取大量历史数据的 json_data
|
||||
# 增加 LIMIT 以便在 Python 中过滤后仍有足够的数据
|
||||
query = f"""
|
||||
SELECT DISTINCT
|
||||
JSONExtractString(json_data, 'companyNumber') as cid,
|
||||
JSONExtractString(json_data, 'companyName') as cname
|
||||
FROM job_data.zhilian_job
|
||||
PREWHERE created_at > now() - INTERVAL 30 DAY
|
||||
WHERE json_data != ''
|
||||
AND JSONExtractString(json_data, 'companyNumber') != ''
|
||||
LIMIT {limit * 2}
|
||||
"""
|
||||
logger.info(f"Executing SQL for Zhilian (limit={limit * 2}): {query[:500]}...")
|
||||
result = await client.query(query)
|
||||
if not result.result_rows:
|
||||
return
|
||||
|
||||
# 在 Python 中过滤掉已存在的和待处理的
|
||||
rows: List[Dict[str, Any]] = []
|
||||
for cid, cname in result.result_rows:
|
||||
if not cid or cid in exclude_cids:
|
||||
continue
|
||||
if len(rows) >= limit:
|
||||
break
|
||||
rows.append(
|
||||
{
|
||||
"source": "zhilian",
|
||||
"company_id": cid,
|
||||
"company_name": cname,
|
||||
"status": "pending",
|
||||
"created_at": datetime.now(),
|
||||
"updated_at": datetime.now(),
|
||||
}
|
||||
)
|
||||
await self._insert_pending(client, rows)
|
||||
logger.info(f"Added {len(rows)} Zhilian companies to pending.")
|
||||
async def _collect_source(self, client, source: str, limit: int) -> dict[str, Any]:
|
||||
config = SOURCE_CONFIGS[source]
|
||||
|
||||
async def _collect_qcwy(self, client, limit: int):
|
||||
logger.info("Collecting QCWY companies...")
|
||||
# 优化:先获取已存在的公司ID,避免在子查询中读取json_data
|
||||
# 使用PREWHERE提前过滤时间范围,减少需要读取的数据量
|
||||
# 检查90天内已处理的公司,避免重复请求
|
||||
days_back_existing = 90 # 查询最近90天的数据,避免重复请求已处理过的公司
|
||||
|
||||
existing_companies_query = f"""
|
||||
SELECT DISTINCT JSONExtractString(json_data, 'companyId') as cid
|
||||
FROM job_data.qcwy_company
|
||||
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
|
||||
AND json_data != ''
|
||||
WHERE JSONExtractString(json_data, 'companyId') != ''
|
||||
LIMIT 50000
|
||||
"""
|
||||
|
||||
# 添加重试机制
|
||||
existing_result = None
|
||||
existing_cids = set() # 默认使用空集合
|
||||
for attempt in range(3):
|
||||
try:
|
||||
logger.info(f"Querying existing QCWY companies (attempt {attempt+1})...")
|
||||
existing_result = await client.query(existing_companies_query)
|
||||
# 查询成功,提取结果
|
||||
existing_cids = {row[0] for row in existing_result.result_rows if row[0]}
|
||||
break
|
||||
except Exception as e:
|
||||
error_str = str(e).lower()
|
||||
if "memory" in error_str or "memory_limit" in error_str:
|
||||
if attempt == 0:
|
||||
# 第一次失败:进一步减少时间范围
|
||||
days_back_existing = 1
|
||||
existing_companies_query = f"""
|
||||
SELECT DISTINCT JSONExtractString(json_data, 'companyId') as cid
|
||||
FROM job_data.qcwy_company
|
||||
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
|
||||
AND json_data != ''
|
||||
WHERE JSONExtractString(json_data, 'companyId') != ''
|
||||
LIMIT 5000
|
||||
"""
|
||||
logger.warning(f"Memory error, reducing time range to {days_back_existing} days")
|
||||
elif attempt == 1:
|
||||
# 第二次失败:使用采样
|
||||
existing_companies_query = f"""
|
||||
SELECT DISTINCT JSONExtractString(json_data, 'companyId') as cid
|
||||
FROM job_data.qcwy_company SAMPLE 0.1
|
||||
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
|
||||
AND json_data != ''
|
||||
WHERE JSONExtractString(json_data, 'companyId') != ''
|
||||
LIMIT 2000
|
||||
"""
|
||||
logger.warning(f"Memory error persists, using SAMPLE 0.1")
|
||||
else:
|
||||
# 最后一次尝试也失败,使用空集合继续执行(避免阻塞整个流程)
|
||||
logger.error(f"Failed to query existing companies after {attempt+1} attempts: {e}")
|
||||
logger.warning("Using empty set for existing_cids, continuing with collection...")
|
||||
existing_cids = set()
|
||||
break
|
||||
else:
|
||||
# 其他错误直接抛出
|
||||
logger.error(f"Non-memory error while querying existing companies: {e}")
|
||||
raise
|
||||
|
||||
pending_query = "SELECT DISTINCT company_id FROM job_data.pending_company WHERE source = 'qcwy'"
|
||||
pending_result = await client.query(pending_query)
|
||||
pending_cids = {row[0] for row in pending_result.result_rows if row[0]}
|
||||
|
||||
# 构建排除列表
|
||||
exclude_cids = existing_cids | pending_cids
|
||||
|
||||
# 优化策略:
|
||||
# 1. 减少时间范围:从30天减少到7天,大幅减少扫描的数据量
|
||||
# 2. 减少LIMIT:从limit*2减少到更小的值,减少内存占用
|
||||
# 3. 使用更严格的PREWHERE条件:先过滤时间,再过滤空json_data和超大JSON
|
||||
# 4. 限制JSON大小:过滤掉过大的json_data(可能包含大量嵌套数据)
|
||||
# 5. 分批查询:如果limit较大,分批处理,每次查询更少的数据
|
||||
|
||||
days_back = 7 # 从30天减少到7天,减少扫描量
|
||||
# 注意:不使用length(json_data)检查,因为它需要读取整个列来计算长度
|
||||
query_limit = min(limit * 2, 100) # 限制最大查询数量,避免内存超限
|
||||
|
||||
# 分批查询策略:如果limit较大,分批处理
|
||||
result = None
|
||||
for attempt in range(3): # 最多尝试3次
|
||||
try:
|
||||
# 根据尝试次数调整参数
|
||||
if attempt == 1:
|
||||
# 第一次失败后:减少时间范围到3天
|
||||
days_back = 3
|
||||
query_limit = min(query_limit, 50)
|
||||
logger.warning(f"Retry {attempt}: Reducing time range to {days_back} days and limit to {query_limit}")
|
||||
elif attempt == 2:
|
||||
# 第二次失败后:使用采样
|
||||
query = f"""
|
||||
SELECT DISTINCT
|
||||
JSONExtractString(json_data, 'coId') as cid,
|
||||
JSONExtractString(json_data, 'companyName') as cname
|
||||
FROM job_data.qcwy_job SAMPLE 0.1
|
||||
PREWHERE created_at > now() - INTERVAL {days_back} DAY
|
||||
AND json_data != ''
|
||||
WHERE JSONExtractString(json_data, 'coId') != ''
|
||||
LIMIT {query_limit}
|
||||
"""
|
||||
logger.warning(f"Retry {attempt}: Using SAMPLE 0.1 to reduce memory usage")
|
||||
result = await client.query(query)
|
||||
break
|
||||
|
||||
# 正常查询或第一次重试
|
||||
query = f"""
|
||||
SELECT DISTINCT
|
||||
JSONExtractString(json_data, 'coId') as cid,
|
||||
JSONExtractString(json_data, 'companyName') as cname
|
||||
FROM job_data.qcwy_job
|
||||
PREWHERE created_at > now() - INTERVAL {days_back} DAY
|
||||
AND json_data != ''
|
||||
WHERE JSONExtractString(json_data, 'coId') != ''
|
||||
LIMIT {query_limit}
|
||||
"""
|
||||
|
||||
logger.info(f"Executing SQL for QCWY (limit={query_limit}, days={days_back}, attempt={attempt+1}): {query[:400]}...")
|
||||
result = await client.query(query)
|
||||
break
|
||||
|
||||
except Exception as e:
|
||||
error_str = str(e).lower()
|
||||
# 如果查询失败(可能是内存超限),继续重试
|
||||
if "memory" in error_str or "memory_limit" in error_str:
|
||||
if attempt < 2:
|
||||
logger.warning(f"Memory error on attempt {attempt+1}: {e}")
|
||||
continue
|
||||
else:
|
||||
# 最后一次尝试也失败,抛出异常
|
||||
logger.error(f"Query failed after {attempt+1} attempts: {e}")
|
||||
raise
|
||||
else:
|
||||
# 其他错误直接抛出
|
||||
logger.error(f"Query failed with non-memory error: {e}")
|
||||
raise
|
||||
|
||||
if not result or not result.result_rows:
|
||||
logger.info("No QCWY companies found in query result.")
|
||||
return
|
||||
|
||||
# 在 Python 中过滤掉已存在的和待处理的
|
||||
rows: List[Dict[str, Any]] = []
|
||||
for cid, cname in result.result_rows:
|
||||
if not cid or cid in exclude_cids:
|
||||
continue
|
||||
if len(rows) >= limit:
|
||||
break
|
||||
rows.append(
|
||||
{
|
||||
"source": "qcwy",
|
||||
"company_id": cid,
|
||||
"company_name": cname,
|
||||
"status": "pending",
|
||||
"created_at": datetime.now(),
|
||||
"updated_at": datetime.now(),
|
||||
}
|
||||
)
|
||||
|
||||
if rows:
|
||||
await self._insert_pending(client, rows)
|
||||
logger.info(f"Added {len(rows)} QCWY companies to pending.")
|
||||
else:
|
||||
logger.info("No new QCWY companies found after filtering.")
|
||||
# 先从 MySQL 取出该平台所有已入队/已入库的 company_id,Python 侧快速排除
|
||||
all_queued = set(await CompanyCleaningQueue.filter(source=source).values_list("company_id", flat=True))
|
||||
all_existing = await company_storage.get_all_company_ids(source)
|
||||
exclude_ids = all_queued | all_existing
|
||||
logger.info(f"Loaded {len(exclude_ids)} known {source} company IDs for exclusion")
|
||||
|
||||
async def _collect_boss(self, client, limit: int):
|
||||
logger.info("Collecting Boss companies...")
|
||||
# 优化:先获取已存在的公司ID,避免在子查询中读取json_data
|
||||
# 使用PREWHERE提前过滤时间范围,减少需要读取的数据量
|
||||
# 检查90天内已处理的公司,避免重复请求
|
||||
days_back_existing = 90 # 查询最近90天的数据,避免重复请求已处理过的公司
|
||||
|
||||
existing_companies_query = f"""
|
||||
SELECT DISTINCT JSONExtractString(json_data, 'brandId') as cid
|
||||
FROM job_data.boss_company
|
||||
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
|
||||
AND json_data != ''
|
||||
WHERE JSONExtractString(json_data, 'brandId') != ''
|
||||
LIMIT 50000
|
||||
"""
|
||||
|
||||
# 添加重试机制
|
||||
existing_result = None
|
||||
existing_cids = set() # 默认使用空集合
|
||||
for attempt in range(3):
|
||||
try:
|
||||
logger.info(f"Querying existing Boss companies (attempt {attempt+1})...")
|
||||
existing_result = await client.query(existing_companies_query)
|
||||
existing_cids = {row[0] for row in existing_result.result_rows if row[0]}
|
||||
break
|
||||
except Exception as e:
|
||||
error_str = str(e).lower()
|
||||
if "memory" in error_str or "memory_limit" in error_str:
|
||||
if attempt == 0:
|
||||
days_back_existing = 1
|
||||
existing_companies_query = f"""
|
||||
SELECT DISTINCT JSONExtractString(json_data, 'brandId') as cid
|
||||
FROM job_data.boss_company
|
||||
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
|
||||
AND json_data != ''
|
||||
WHERE JSONExtractString(json_data, 'brandId') != ''
|
||||
"""
|
||||
logger.warning(f"Memory error, reducing time range to {days_back_existing} days")
|
||||
elif attempt == 1:
|
||||
existing_companies_query = f"""
|
||||
SELECT DISTINCT JSONExtractString(json_data, 'brandId') as cid
|
||||
FROM job_data.boss_company SAMPLE 0.1
|
||||
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
|
||||
AND json_data != ''
|
||||
WHERE JSONExtractString(json_data, 'brandId') != ''
|
||||
"""
|
||||
logger.warning(f"Memory error persists, using SAMPLE 0.1")
|
||||
else:
|
||||
logger.error(f"Failed to query existing companies after {attempt+1} attempts: {e}")
|
||||
logger.warning("Using empty set for existing_cids, continuing with collection...")
|
||||
existing_cids = set()
|
||||
break
|
||||
else:
|
||||
logger.error(f"Non-memory error while querying existing companies: {e}")
|
||||
raise
|
||||
|
||||
pending_query = "SELECT DISTINCT company_id FROM job_data.pending_company WHERE source = 'boss'"
|
||||
pending_result = await client.query(pending_query)
|
||||
pending_cids = {row[0] for row in pending_result.result_rows if row[0]}
|
||||
|
||||
# 构建排除列表
|
||||
exclude_cids = existing_cids | pending_cids
|
||||
|
||||
# 优化:添加时间范围过滤,只查询最近30天的数据,减少扫描量
|
||||
# 使用 PREWHERE 提前过滤时间范围,避免读取大量历史数据的 json_data
|
||||
# 增加 LIMIT 以便在 Python 中过滤后仍有足够的数据
|
||||
query = f"""
|
||||
SELECT DISTINCT
|
||||
JSONExtractString(json_data, 'brandId') as cid,
|
||||
JSONExtractString(json_data, 'brandName') as cname
|
||||
FROM job_data.boss_job
|
||||
PREWHERE created_at > now() - INTERVAL 30 DAY
|
||||
WHERE json_data != ''
|
||||
AND JSONExtractString(json_data, 'brandId') != ''
|
||||
LIMIT {limit * 2}
|
||||
"""
|
||||
logger.info(f"Executing SQL for Boss (limit={limit * 2}): {query[:500]}...")
|
||||
result = await client.query(query)
|
||||
if not result.result_rows:
|
||||
return
|
||||
|
||||
# 在 Python 中过滤掉已存在的和待处理的
|
||||
rows: List[Dict[str, Any]] = []
|
||||
for cid, cname in result.result_rows:
|
||||
if not cid or cid in exclude_cids:
|
||||
continue
|
||||
if len(rows) >= limit:
|
||||
break
|
||||
rows.append(
|
||||
{
|
||||
"source": "boss",
|
||||
"company_id": cid,
|
||||
"company_name": cname,
|
||||
"status": "pending",
|
||||
"created_at": datetime.now(),
|
||||
"updated_at": datetime.now(),
|
||||
}
|
||||
)
|
||||
await self._insert_pending(client, rows)
|
||||
logger.info(f"Added {len(rows)} Boss companies to pending.")
|
||||
|
||||
async def _insert_pending(self, client, rows: List[Dict[str, Any]]):
|
||||
if not rows:
|
||||
return
|
||||
data: List[List[Any]] = []
|
||||
for r in rows:
|
||||
data.append(
|
||||
[
|
||||
r["source"],
|
||||
r["company_id"],
|
||||
r["company_name"],
|
||||
r["status"],
|
||||
"",
|
||||
r["created_at"],
|
||||
r["updated_at"],
|
||||
1,
|
||||
]
|
||||
)
|
||||
await client.insert(
|
||||
"job_data.pending_company",
|
||||
data,
|
||||
column_names=[
|
||||
"source",
|
||||
"company_id",
|
||||
"company_name",
|
||||
"status",
|
||||
"error_msg",
|
||||
"created_at",
|
||||
"updated_at",
|
||||
"version",
|
||||
],
|
||||
# 用 OFFSET 跳过已知公司数量,获取新公司
|
||||
offset = len(exclude_ids)
|
||||
result = await self._query_candidate_rows(
|
||||
client=client,
|
||||
table=config["job_table"],
|
||||
company_id_expr=config["company_id_expr"],
|
||||
company_name_expr=config["company_name_expr"],
|
||||
days_back=config["days_back"],
|
||||
limit=limit,
|
||||
max_query_limit=config["max_query_limit"],
|
||||
offset=offset,
|
||||
)
|
||||
if not result.result_rows:
|
||||
logger.info(f"No new {source} companies found in job table query.")
|
||||
return {
|
||||
"source": source,
|
||||
"query_count": 0,
|
||||
"deduped_count": 0,
|
||||
"existing_count": len(all_existing),
|
||||
"queued_count": len(all_queued),
|
||||
"created_count": 0,
|
||||
}
|
||||
|
||||
deduped_candidates: list[dict[str, str]] = []
|
||||
seen_ids: set[str] = set()
|
||||
for raw_company_id, company_name in result.result_rows:
|
||||
company_id = normalize_company_id(source, raw_company_id)
|
||||
if not company_id or company_id in seen_ids or company_id in exclude_ids:
|
||||
continue
|
||||
seen_ids.add(company_id)
|
||||
deduped_candidates.append(
|
||||
{
|
||||
"company_id": company_id,
|
||||
"company_name": (company_name or "").strip(),
|
||||
}
|
||||
)
|
||||
if len(deduped_candidates) >= limit:
|
||||
break
|
||||
|
||||
created_count = await company_storage.enqueue_companies(source, deduped_candidates)
|
||||
logger.info(f"Added {created_count} {source} companies to MySQL queue.")
|
||||
return {
|
||||
"source": source,
|
||||
"query_count": len(result.result_rows),
|
||||
"deduped_count": len(deduped_candidates),
|
||||
"existing_count": len(all_existing),
|
||||
"queued_count": len(all_queued),
|
||||
"created_count": created_count,
|
||||
}
|
||||
|
||||
async def _query_candidate_rows(
    self,
    *,
    client,
    table: str,
    company_id_expr: str,
    company_name_expr: str,
    days_back: int,
    limit: int,
    max_query_limit: Optional[int],
    offset: int = 0,
):
    """Query distinct (company id, company name) candidates from a job table.

    The query is attempted up to three times, but only memory-related
    failures are retried:

    * attempt 1 uses the requested window, ``limit * 5`` rows (capped by
      ``max_query_limit``) and the caller's ``offset``;
    * attempt 2 narrows the window to at most 3 days, shrinks the row limit
      to ``min(current, max(limit, 50))`` and drops the offset;
    * attempt 3 keeps those reduced settings and adds ``SAMPLE 0.1``.

    Args:
        client: ClickHouse client exposing an awaitable ``query(sql)``.
        table: Table name inside the ``job_data`` database.
        company_id_expr: SQL expression yielding the company id.
        company_name_expr: SQL expression yielding the company name.
        days_back: Look-back window in days for ``created_at``.
        limit: Number of candidates the caller ultimately wants.
        max_query_limit: Optional upper bound for the SQL ``LIMIT``.
        offset: SQL ``OFFSET`` used on the first attempt only.

    Returns:
        Whatever ``client.query`` returns for the first successful attempt.

    Raises:
        Exception: A non-memory error is re-raised immediately; if all three
            attempts fail with memory errors, the last one is raised.
    """
    current_days = days_back
    current_limit = limit * 5
    if max_query_limit is not None:
        current_limit = min(current_limit, max_query_limit)

    last_error: Optional[Exception] = None
    for attempt in range(3):
        sample_sql = " SAMPLE 0.1" if attempt == 2 else ""
        current_offset = offset
        if attempt >= 1:
            # Fallback attempts scan a much smaller row set, so an offset sized
            # for the full window would skip past every row. The reductions are
            # idempotent, which keeps attempt 3 on the same window and limit as
            # attempt 2 - and, unlike before, on offset 0 as well.
            current_days = max(1, min(current_days, 3))
            current_limit = min(current_limit, max(limit, 50))
            current_offset = 0
        query = f"""
            SELECT DISTINCT
                {company_id_expr} AS cid,
                {company_name_expr} AS cname
            FROM job_data.{table}{sample_sql}
            PREWHERE created_at > now() - INTERVAL {current_days} DAY
                AND json_data != ''
            WHERE {company_id_expr} != ''
            LIMIT {current_limit} OFFSET {current_offset}
        """
        try:
            logger.info(
                f"Querying company candidates from {table} "
                f"(days={current_days}, limit={current_limit}, attempt={attempt + 1})"
            )
            return await client.query(query)
        except Exception as exc:
            last_error = exc
            # "memory_limit" always contains "memory", so one test is enough.
            if "memory" in str(exc).lower():
                logger.warning(f"Memory-sensitive query retry for {table}: {exc}")
                continue
            raise
    assert last_error is not None
    raise last_error
|
||||
|
||||
async def process_single_company(
|
||||
self,
|
||||
@ -455,73 +210,47 @@ class CompanyCleaner:
|
||||
company_id: str,
|
||||
proxy: Optional[str] = None,
|
||||
max_delay_seconds: int = 5,
|
||||
) -> Dict[str, Any]:
|
||||
client = await clickhouse_manager.get_client()
|
||||
) -> dict[str, Any]:
|
||||
normalized_id = normalize_company_id(source, company_id)
|
||||
queue, _ = await company_storage.enqueue_company(source, normalized_id)
|
||||
if proxy:
|
||||
self._apply_proxy(proxy)
|
||||
delay = 0
|
||||
if max_delay_seconds and max_delay_seconds > 0:
|
||||
delay = random.randint(1, max_delay_seconds)
|
||||
if delay > 0:
|
||||
await asyncio.sleep(delay)
|
||||
query = f"""
|
||||
SELECT source, company_id, company_name, version
|
||||
FROM job_data.pending_company
|
||||
FINAL
|
||||
WHERE source = '{source}' AND company_id = '{company_id}'
|
||||
ORDER BY version DESC
|
||||
LIMIT 1
|
||||
"""
|
||||
result = await client.query(query)
|
||||
if result.result_rows:
|
||||
source_value, cid, cname, version = result.result_rows[0]
|
||||
else:
|
||||
source_value = source
|
||||
cid = company_id
|
||||
cname = ""
|
||||
version = 0
|
||||
await asyncio.sleep(random.randint(1, max_delay_seconds))
|
||||
|
||||
await company_storage.mark_queue_processing(queue)
|
||||
try:
|
||||
success = await self._fetch_and_save(source_value, cid)
|
||||
status = "done" if success else "failed"
|
||||
error_msg = "" if success else "Fetch failed"
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing {source_value} {cid}: {e}")
|
||||
status = "failed"
|
||||
error_msg = str(e)
|
||||
await client.insert(
|
||||
"job_data.pending_company",
|
||||
[
|
||||
[
|
||||
source_value,
|
||||
cid,
|
||||
cname,
|
||||
status,
|
||||
error_msg.replace("'", "''"),
|
||||
datetime.now(),
|
||||
datetime.now(),
|
||||
int(version) + 1,
|
||||
]
|
||||
],
|
||||
column_names=[
|
||||
"source",
|
||||
"company_id",
|
||||
"company_name",
|
||||
"status",
|
||||
"error_msg",
|
||||
"created_at",
|
||||
"updated_at",
|
||||
"version",
|
||||
],
|
||||
)
|
||||
return {
|
||||
"success": status == "done",
|
||||
"source": source_value,
|
||||
"company_id": cid,
|
||||
"company_name": cname,
|
||||
"status": status,
|
||||
"error_msg": error_msg,
|
||||
"version": int(version) + 1,
|
||||
}
|
||||
persist_result = await self._fetch_and_save(source, normalized_id)
|
||||
jobs_result = await self._sync_company_jobs(source, normalized_id)
|
||||
if persist_result["company_name"] and queue.company_name != persist_result["company_name"]:
|
||||
queue.company_name = persist_result["company_name"]
|
||||
await company_storage.mark_queue_result(queue, status="done", jobs_summary=jobs_result)
|
||||
return {
|
||||
"success": True,
|
||||
"source": source,
|
||||
"company_id": normalized_id,
|
||||
"company_name": persist_result["company_name"],
|
||||
"status": "done",
|
||||
"error_msg": "",
|
||||
"created": persist_result["created"],
|
||||
"jobs_summary": jobs_result,
|
||||
}
|
||||
except Exception as exc:
|
||||
logger.error(f"Error processing {source} {normalized_id}: {exc}")
|
||||
await company_storage.mark_queue_result(
|
||||
queue,
|
||||
status="failed",
|
||||
error_msg=str(exc),
|
||||
increment_retry=True,
|
||||
)
|
||||
return {
|
||||
"success": False,
|
||||
"source": source,
|
||||
"company_id": normalized_id,
|
||||
"company_name": queue.company_name or "",
|
||||
"status": "failed",
|
||||
"error_msg": str(exc),
|
||||
}
|
||||
|
||||
async def process_pending_companies(
|
||||
self,
|
||||
@ -530,116 +259,81 @@ class CompanyCleaner:
|
||||
proxy: Optional[str] = None,
|
||||
max_delay_seconds: int = 0,
|
||||
):
|
||||
client = await clickhouse_manager.get_client()
|
||||
logger.info(f"Processing pending companies (limit={limit}, source={source or 'all'})...")
|
||||
if proxy:
|
||||
self._apply_proxy(proxy)
|
||||
where_clause = "WHERE status = 'pending'"
|
||||
|
||||
query = CompanyCleaningQueue.filter(status="pending")
|
||||
if source:
|
||||
where_clause += f" AND source = '{source}'"
|
||||
query = f"""
|
||||
SELECT source, company_id, company_name, version
|
||||
FROM job_data.pending_company
|
||||
FINAL
|
||||
{where_clause}
|
||||
ORDER BY created_at ASC
|
||||
LIMIT {limit}
|
||||
"""
|
||||
result = await client.query(query)
|
||||
if not result.result_rows:
|
||||
query = query.filter(source=source)
|
||||
queue_rows = await query.order_by("created_at").limit(limit)
|
||||
if not queue_rows:
|
||||
logger.info("No pending companies to process.")
|
||||
return
|
||||
for source_value, cid, cname, version in result.result_rows:
|
||||
logger.info(f"Processing {source_value} company: {cname} ({cid})")
|
||||
|
||||
for queue in queue_rows:
|
||||
logger.info(f"Processing {queue.source} company: {queue.company_name or ''} ({queue.company_id})")
|
||||
await company_storage.mark_queue_processing(queue)
|
||||
try:
|
||||
if max_delay_seconds and max_delay_seconds > 0:
|
||||
delay = random.randint(1, max_delay_seconds)
|
||||
if delay > 0:
|
||||
await asyncio.sleep(delay)
|
||||
success = await self._fetch_and_save(source_value, cid)
|
||||
status = "done" if success else "failed"
|
||||
error_msg = "" if success else "Fetch failed"
|
||||
except Exception as e:
|
||||
logger.error(f"Error processing {source_value} {cid}: {e}")
|
||||
status = "failed"
|
||||
error_msg = str(e)
|
||||
await client.insert(
|
||||
"job_data.pending_company",
|
||||
[
|
||||
[
|
||||
source_value,
|
||||
cid,
|
||||
cname,
|
||||
status,
|
||||
error_msg.replace("'", "''"),
|
||||
datetime.now(),
|
||||
datetime.now(),
|
||||
int(version) + 1,
|
||||
]
|
||||
],
|
||||
column_names=[
|
||||
"source",
|
||||
"company_id",
|
||||
"company_name",
|
||||
"status",
|
||||
"error_msg",
|
||||
"created_at",
|
||||
"updated_at",
|
||||
"version",
|
||||
],
|
||||
)
|
||||
await asyncio.sleep(random.randint(1, max_delay_seconds))
|
||||
persist_result = await self._fetch_and_save(queue.source, queue.company_id)
|
||||
jobs_result = await self._sync_company_jobs(queue.source, queue.company_id)
|
||||
logger.info(
|
||||
f"Synced {queue.source} company jobs: "
|
||||
f"fetched={jobs_result['jobs_fetched']} stored={jobs_result['stored_success']} "
|
||||
f"duplicate={jobs_result['duplicate']} failed={jobs_result['failed']}"
|
||||
)
|
||||
if persist_result["company_name"] and queue.company_name != persist_result["company_name"]:
|
||||
queue.company_name = persist_result["company_name"]
|
||||
await company_storage.mark_queue_result(queue, status="done", jobs_summary=jobs_result)
|
||||
except Exception as exc:
|
||||
logger.error(f"Error processing {queue.source} {queue.company_id}: {exc}")
|
||||
await company_storage.mark_queue_result(
|
||||
queue,
|
||||
status="failed",
|
||||
error_msg=str(exc),
|
||||
increment_retry=True,
|
||||
)
|
||||
|
||||
async def _fetch_and_save(self, source: str, company_id: str) -> bool:
|
||||
data: Optional[Dict[str, Any]] = None
|
||||
target_table = ""
|
||||
if source == "zhilian":
|
||||
data = self.zhilian_service.get_company_detail(company_id)
|
||||
target_table = "zhilian_company"
|
||||
elif source == "qcwy":
|
||||
data = self.qcwy_service.get_company_info(company_id)
|
||||
target_table = "qcwy_company"
|
||||
elif source == "boss":
|
||||
await self._ensure_boss_token_loaded()
|
||||
data = self.boss_service.get_company_detail_by_id(company_id)
|
||||
target_table = "boss_company"
|
||||
async def _fetch_and_save(self, source: str, company_id: str) -> dict[str, Any]:
|
||||
data = await self._fetch_company_data(source, company_id)
|
||||
if not data:
|
||||
logger.error(f"No data returned from source={source} company_id={company_id}")
|
||||
return False
|
||||
try:
|
||||
logger.info(
|
||||
f"Raw company data from source={source} company_id={company_id}: "
|
||||
f"{json.dumps(data, ensure_ascii=False)[:2000]}"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to log raw company data for source={source} company_id={company_id}: {e}")
|
||||
client = await clickhouse_manager.get_client()
|
||||
name = ""
|
||||
if source == "zhilian":
|
||||
name = data.get("companyBase", {}).get("companyName", "")
|
||||
elif source == "qcwy":
|
||||
name = data.get("companyName", "")
|
||||
elif source == "boss":
|
||||
name = data.get("name", "")
|
||||
json_str = json.dumps(data, ensure_ascii=False)
|
||||
await client.insert(
|
||||
f"job_data.{target_table}",
|
||||
[[0, json_str, name, datetime.now(), datetime.now()]],
|
||||
column_names=["id", "json_data", "company_name", "created_at", "updated_at"],
|
||||
)
|
||||
return True
|
||||
raise ValueError(f"No data returned from source={source} company_id={company_id}")
|
||||
return await company_storage.upsert_company(source, data, company_id=company_id)
|
||||
|
||||
async def _sync_company_jobs(self, source: str, company_id: str) -> dict[str, Any]:
|
||||
try:
|
||||
return await self.company_jobs_sync.sync_company_jobs(source, company_id)
|
||||
except Exception as exc:
|
||||
logger.warning(f"Sync company jobs failed for {source} {company_id}: {exc}")
|
||||
return {
|
||||
"success": False,
|
||||
"source": source,
|
||||
"company_id": company_id,
|
||||
"jobs_fetched": 0,
|
||||
"stored_success": 0,
|
||||
"duplicate": 0,
|
||||
"failed": 0,
|
||||
"error": str(exc),
|
||||
}
|
||||
|
||||
async def _fetch_company_data(self, source: str, company_id: str) -> dict[str, Any]:
|
||||
if source == "zhilian":
|
||||
data = await asyncio.to_thread(self.zhilian_service.get_company_detail, company_id)
|
||||
return data or {}
|
||||
if source == "qcwy":
|
||||
data = await asyncio.to_thread(self.qcwy_service.get_company_info, company_id)
|
||||
return data or {}
|
||||
if source == "boss":
|
||||
await self._ensure_boss_token_loaded()
|
||||
data = await asyncio.to_thread(self.boss_service.get_company_detail_by_id, company_id)
|
||||
return data or {}
|
||||
raise ValueError(f"unsupported source: {source}")
|
||||
|
||||
async def cleanup_old_records(self):
|
||||
""" 清理已完成或失败的记录 (每日调用) """
|
||||
client = await clickhouse_manager.get_client()
|
||||
logger.info("Starting cleanup of processed pending companies...")
|
||||
|
||||
# ClickHouse mutations are async, but lightweight for this purpose
|
||||
query = "ALTER TABLE job_data.pending_company DELETE WHERE status IN ('done', 'failed')"
|
||||
try:
|
||||
await client.command(query)
|
||||
logger.info("Cleanup command executed successfully.")
|
||||
except Exception as e:
|
||||
logger.error(f"Cleanup failed: {e}")
|
||||
await CompanyCleaningQueue.filter(status__in=["done", "failed"]).delete()
|
||||
|
||||
|
||||
company_cleaner = CompanyCleaner()
|
||||
|
||||
355
app/services/company_storage.py
Normal file
355
app/services/company_storage.py
Normal file
@ -0,0 +1,355 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, Iterable, Optional, Type
|
||||
|
||||
from app.models.company import (
|
||||
BaseCompanyModel,
|
||||
BossCompany,
|
||||
CompanyCleaningQueue,
|
||||
QcwyCompany,
|
||||
ZhilianCompany,
|
||||
)
|
||||
|
||||
|
||||
COMPANY_SOURCES = {"boss", "qcwy", "zhilian"}
|
||||
QUEUE_TERMINAL_STATUSES = {"done", "failed"}
|
||||
|
||||
|
||||
def normalize_company_id(source: str, company_id: str) -> str:
    """Return ``company_id`` as a trimmed string in canonical form.

    ``None`` and other falsy ids become ``""``. For the ``qcwy`` source a
    case-insensitive ``co`` prefix followed only by digits is removed, so
    ``"CO123"`` becomes ``"123"``. Every other value is returned trimmed.
    """
    cleaned = str(company_id or "").strip()
    if source != "qcwy":
        return cleaned
    prefix, digits = cleaned[:2], cleaned[2:]
    if prefix.lower() == "co" and digits.isdigit():
        return digits
    return cleaned
|
||||
|
||||
|
||||
def _pick_first(data: dict[str, Any], *keys: str) -> Optional[Any]:
|
||||
for key in keys:
|
||||
value = data.get(key)
|
||||
if value not in (None, ""):
|
||||
return value
|
||||
return None
|
||||
|
||||
|
||||
def _nested_get(data: dict[str, Any], *path: str) -> Any:
|
||||
current: Any = data
|
||||
for key in path:
|
||||
if not isinstance(current, dict):
|
||||
return None
|
||||
current = current.get(key)
|
||||
return current
|
||||
|
||||
|
||||
def _clean_text(value: Any) -> Optional[str]:
|
||||
if value is None:
|
||||
return None
|
||||
text = str(value).strip()
|
||||
return text or None
|
||||
|
||||
|
||||
def _model_for_source(source: str) -> Type[BaseCompanyModel]:
    """Return the company model class that stores records for ``source``.

    Raises:
        ValueError: If ``source`` is not ``boss``, ``qcwy`` or ``zhilian``.
    """
    models_by_source: dict[str, Type[BaseCompanyModel]] = {
        "boss": BossCompany,
        "qcwy": QcwyCompany,
        "zhilian": ZhilianCompany,
    }
    try:
        return models_by_source[source]
    except KeyError:
        raise ValueError(f"unsupported source: {source}") from None
|
||||
|
||||
|
||||
def _extract_boss_fields(raw: dict[str, Any], company_id: str) -> dict[str, Any]:
    """Map a raw Boss company payload onto the normalized company columns.

    The payload is read from ``raw["zpData"]`` when that is a dict, otherwise
    from ``raw`` itself. ``brandComInfoVO`` and ``companyFullInfoVO`` are
    used only when they are dicts; any other value is treated as absent
    rather than crashing inside ``_pick_first``.

    Args:
        raw: Response dict from the Boss company-detail call.
        company_id: Id the caller asked for; takes precedence over the ids
            found in the payload.

    Returns:
        Dict with ``source_company_id``, ``company_name`` (never ``None``,
        possibly ``""``) and the optional descriptive columns, each either a
        non-empty string or ``None``.
    """
    zp_data = raw.get("zpData")
    payload = zp_data if isinstance(zp_data, dict) else raw

    brand = payload.get("brandComInfoVO")
    if not isinstance(brand, dict):
        brand = {}
    company_full = payload.get("companyFullInfoVO")
    if not isinstance(company_full, dict):
        company_full = {}

    return {
        "source_company_id": normalize_company_id(
            "boss", company_id or _pick_first(brand, "encryptBrandId", "brandId")
        ),
        "company_name": _clean_text(
            _pick_first(payload, "name")
            or _pick_first(company_full, "name", "brandName")
            or _pick_first(brand, "brandName")
        ) or "",
        "company_type": _clean_text(_pick_first(company_full, "typeName") or _pick_first(brand, "brandIndustry")),
        "industry": _clean_text(_pick_first(brand, "industryName") or _pick_first(company_full, "industry")),
        "company_size": _clean_text(_pick_first(brand, "scaleName") or _pick_first(company_full, "scaleName")),
        "financing_stage": _clean_text(_pick_first(brand, "stageName") or _pick_first(company_full, "stageName")),
        "city": _clean_text(_pick_first(company_full, "cityName", "city")),
        "address": _clean_text(_pick_first(company_full, "address", "addressInfo")),
        "website": _clean_text(_pick_first(company_full, "website")),
        "logo_url": _clean_text(_pick_first(company_full, "logo", "brandLogo") or _pick_first(brand, "logo", "brandLogo")),
        "description": _clean_text(
            _pick_first(company_full, "introduce", "introduction", "companyDesc")
            or _pick_first(brand, "introduce")
        ),
    }
|
||||
|
||||
|
||||
def _extract_qcwy_fields(raw: dict[str, Any], company_id: str) -> dict[str, Any]:
    """Map a raw 51job (qcwy) company payload onto the normalized columns.

    Values are taken from the top level of ``raw`` first and from the nested
    ``coinfo`` dict as a fallback. ``financingStage`` and ``coinfo`` are
    used only when they are dicts; any other value is treated as absent, so
    a plain-string ``financingStage`` no longer raises.

    Args:
        raw: Response dict from the qcwy company-info call.
        company_id: Id the caller asked for; takes precedence over the ids
            found in the payload.

    Returns:
        Dict with ``source_company_id``, ``company_name`` (never ``None``,
        possibly ``""``) and the optional descriptive columns, each either a
        non-empty string or ``None``.
    """
    financing = raw.get("financingStage")
    if not isinstance(financing, dict):
        financing = {}
    coinfo = raw.get("coinfo")
    if not isinstance(coinfo, dict):
        coinfo = {}

    return {
        "source_company_id": normalize_company_id(
            "qcwy",
            company_id or _pick_first(raw, "companyId", "coId") or _nested_get(raw, "coinfo", "coid"),
        ),
        "company_name": _clean_text(
            _pick_first(raw, "companyName", "fullCompanyName", "companyNameEn")
            or _pick_first(coinfo, "coname", "brandName")
        ) or "",
        "company_type": _clean_text(_pick_first(raw, "companyTypeString", "orgTypeName") or _pick_first(coinfo, "cotype")),
        "industry": _clean_text(
            _pick_first(raw, "industryName", "companyIndustryType1Str")
            or _pick_first(coinfo, "indtype1", "indtype2", "coIndustryText")
        ),
        "company_size": _clean_text(
            _pick_first(raw, "companySizeString", "companySize", "orgSizeName")
            or _pick_first(coinfo, "cosize")
        ),
        "financing_stage": _clean_text(_pick_first(financing, "name") or _pick_first(raw, "financingStageName")),
        "city": _clean_text(_pick_first(raw, "cityName", "jobAreaString", "workCity") or _pick_first(coinfo, "areaString")),
        "address": _clean_text(
            _pick_first(raw, "address", "location")
            or _nested_get(raw, "workLocation", "workAddress")
            or _pick_first(coinfo, "caddr")
        ),
        "website": _clean_text(_pick_first(raw, "companyUrl", "companyHref") or _pick_first(coinfo, "webUrl")),
        "logo_url": _clean_text(_pick_first(raw, "companyLogo") or _pick_first(coinfo, "logourl")),
        "description": _clean_text(
            _pick_first(raw, "companyDesc", "company_desc", "description")
            or _nested_get(raw, "campusRootOrgInfo", "description")
            or _pick_first(coinfo, "coinfo")
        ),
    }
|
||||
|
||||
|
||||
def _extract_zhilian_fields(raw: dict[str, Any], company_id: str) -> dict[str, Any]:
    """Map a raw Zhilian company payload onto the normalized company columns.

    The payload is read from ``raw["data"]`` when that is a dict, otherwise
    from ``raw`` itself. ``companyBase`` is the primary section and
    ``detailedCompany`` the fallback; each is used only when it is a dict,
    and any other value is treated as absent instead of raising.

    Args:
        raw: Response dict from the Zhilian company-detail call.
        company_id: Id the caller asked for; takes precedence over the ids
            found in the payload.

    Returns:
        Dict with ``source_company_id``, ``company_name`` (never ``None``,
        possibly ``""``) and the optional descriptive columns, each either a
        non-empty string or ``None``.
    """
    inner = raw.get("data")
    data = inner if isinstance(inner, dict) else raw

    company_base = data.get("companyBase")
    if not isinstance(company_base, dict):
        company_base = {}
    detailed_company = data.get("detailedCompany")
    if not isinstance(detailed_company, dict):
        detailed_company = {}

    return {
        "source_company_id": normalize_company_id(
            "zhilian",
            company_id
            or _pick_first(company_base, "companyNumber", "number")
            or _pick_first(detailed_company, "companyNumber", "number"),
        ),
        "company_name": _clean_text(_pick_first(company_base, "companyName") or _pick_first(data, "companyName")) or "",
        "company_type": _clean_text(
            _pick_first(company_base, "companyTypeName", "companyType")
            or _pick_first(detailed_company, "companyTypeName")
        ),
        "industry": _clean_text(_pick_first(company_base, "industryName") or _pick_first(detailed_company, "industryName")),
        "company_size": _clean_text(
            _pick_first(company_base, "companySize", "companySizeString")
            or _pick_first(detailed_company, "companySize")
        ),
        "financing_stage": _clean_text(
            _pick_first(company_base, "financingStageName")
            or _nested_get(company_base, "financingStage", "name")
            or _nested_get(detailed_company, "financingStage", "name")
        ),
        "city": _clean_text(_pick_first(company_base, "cityName") or _pick_first(detailed_company, "cityName")),
        "address": _clean_text(_pick_first(company_base, "address") or _pick_first(detailed_company, "address")),
        "website": _clean_text(_pick_first(company_base, "companyUrl", "website")),
        "logo_url": _clean_text(_pick_first(company_base, "logoUrl", "companyLogo")),
        "description": _clean_text(
            _pick_first(company_base, "companyDescWithHtml", "companyDesc")
            or _pick_first(detailed_company, "companyDescription", "companyDesc")
        ),
    }
|
||||
|
||||
|
||||
def extract_company_fields(source: str, raw: dict[str, Any], company_id: str) -> dict[str, Any]:
    """Dispatch ``raw`` to the field extractor that matches ``source``.

    Raises:
        ValueError: If ``source`` is not ``boss``, ``qcwy`` or ``zhilian``.
    """
    extractors = (
        ("boss", _extract_boss_fields),
        ("qcwy", _extract_qcwy_fields),
        ("zhilian", _extract_zhilian_fields),
    )
    for name, extractor in extractors:
        if source == name:
            return extractor(raw, company_id)
    raise ValueError(f"unsupported source: {source}")
|
||||
|
||||
|
||||
class CompanyStorageService:
    """Persistence helpers for company records and the company cleaning queue."""

    @staticmethod
    def company_model(source: str) -> Type[BaseCompanyModel]:
        """Return the company model class for ``source`` (ValueError if unsupported)."""
        return _model_for_source(source)

    async def get_existing_company_ids(self, source: str, company_ids: Iterable[str]) -> set[str]:
        """Return the subset of ``company_ids`` already stored for ``source``.

        Ids are normalized before the lookup; falsy entries are ignored and an
        empty input yields an empty set without querying.
        """
        normalized_ids = [normalize_company_id(source, item) for item in company_ids if item]
        if not normalized_ids:
            return set()
        model = self.company_model(source)
        rows = await model.filter(source_company_id__in=normalized_ids).values_list("source_company_id", flat=True)
        return set(rows)

    async def get_all_company_ids(self, source: str) -> set[str]:
        """Return every company id already stored for this platform (used to exclude them from the ClickHouse query)."""
        model = self.company_model(source)
        rows = await model.all().values_list("source_company_id", flat=True)
        return set(rows)

    async def get_existing_queue_ids(self, source: str, company_ids: Iterable[str]) -> set[str]:
        """Return the subset of ``company_ids`` that already have a queue row for ``source``."""
        normalized_ids = [normalize_company_id(source, item) for item in company_ids if item]
        if not normalized_ids:
            return set()
        rows = await CompanyCleaningQueue.filter(source=source, company_id__in=normalized_ids).values_list("company_id", flat=True)
        return set(rows)

    async def enqueue_company(self, source: str, company_id: str, company_name: str = "") -> tuple[CompanyCleaningQueue, bool]:
        """Get or create the queue row for one company.

        A new row starts as ``pending`` with zeroed counters. An existing row
        is left as it is, except that a non-empty ``company_name`` differing
        from the stored one is written back.

        Returns:
            ``(queue_row, created)``, where ``created`` is True only when the
            row was inserted by this call.
        """
        normalized_id = normalize_company_id(source, company_id)
        defaults = {
            "company_name": company_name or "",
            "status": "pending",
            "error_msg": "",
            "retry_count": 0,
            "started_at": None,
            "finished_at": None,
            "jobs_fetched": 0,
            "jobs_stored": 0,
            "jobs_duplicate": 0,
            "jobs_failed": 0,
            "jobs_error_msg": "",
        }
        queue, created = await CompanyCleaningQueue.get_or_create(
            source=source,
            company_id=normalized_id,
            defaults=defaults,
        )
        if not created and company_name and queue.company_name != company_name:
            queue.company_name = company_name
            await queue.save(update_fields=["company_name", "updated_at"])
        return queue, created

    async def enqueue_companies(self, source: str, companies: Iterable[dict[str, str]]) -> int:
        """Enqueue several companies and return how many queue rows were newly created.

        Each item is a dict with ``company_id`` and optionally
        ``company_name``. Items whose id is missing or blank after
        normalization are skipped, so no queue row is keyed by an empty id.
        """
        created_count = 0
        for item in companies:
            company_id = normalize_company_id(source, item.get("company_id", ""))
            if not company_id:
                # A blank id cannot be fetched later; do not queue it.
                continue
            _, created = await self.enqueue_company(
                source=source,
                company_id=company_id,
                company_name=item.get("company_name", "") or "",
            )
            if created:
                created_count += 1
        return created_count

    async def get_company_record(self, source: str, company_id: str) -> Optional[BaseCompanyModel]:
        """Return the stored company record for the normalized id, or ``None``."""
        normalized_id = normalize_company_id(source, company_id)
        model = self.company_model(source)
        return await model.get_or_none(source_company_id=normalized_id)

    async def upsert_company(
        self,
        source: str,
        raw_data: dict[str, Any],
        *,
        company_id: Optional[str] = None,
    ) -> dict[str, Any]:
        """Create or update the company record extracted from ``raw_data``.

        Fields come from ``extract_company_fields``; the raw payload is stored
        under ``raw_json`` and ``last_crawled_at`` is set to now. A new record
        additionally gets ``first_crawled_at``.

        Returns:
            Dict with ``success``, ``created``, ``company_id``,
            ``company_name``, a ``data_summary`` dict and the ``record``.

        Raises:
            ValueError: If no company id or no company name can be determined,
                or if ``source`` is unsupported.
        """
        normalized_id = normalize_company_id(source, company_id or "")
        fields = extract_company_fields(source, raw_data, normalized_id)
        # The extractor may have recovered the id from the payload itself.
        normalized_id = fields["source_company_id"]
        if not normalized_id:
            raise ValueError(f"missing normalized company id for source={source}")
        if not fields["company_name"]:
            raise ValueError(f"missing company name for source={source} company_id={normalized_id}")

        model = self.company_model(source)
        record = await model.get_or_none(source_company_id=normalized_id)
        now = datetime.now()
        payload = {
            **fields,
            "raw_json": raw_data,
            "last_crawled_at": now,
        }

        if record:
            for key, value in payload.items():
                setattr(record, key, value)
            await record.save()
            created = False
        else:
            record = await model.create(
                **payload,
                first_crawled_at=now,
            )
            created = True

        return {
            "success": True,
            "created": created,
            "company_id": normalized_id,
            "company_name": record.company_name,
            "data_summary": {
                "source": source,
                "company_id": normalized_id,
                "company_name": record.company_name,
                "created": created,
            },
            "record": record,
        }

    async def mark_queue_processing(self, queue: CompanyCleaningQueue) -> None:
        """Mark ``queue`` as ``processing`` and reset its previous outcome.

        Clears the error text, end time and job counters, stamps
        ``started_at`` with the current time and saves only those fields.
        """
        queue.status = "processing"
        queue.error_msg = ""
        queue.started_at = datetime.now()
        queue.finished_at = None
        queue.jobs_fetched = 0
        queue.jobs_stored = 0
        queue.jobs_duplicate = 0
        queue.jobs_failed = 0
        queue.jobs_error_msg = ""
        await queue.save(
            update_fields=[
                "status",
                "error_msg",
                "started_at",
                "finished_at",
                "jobs_fetched",
                "jobs_stored",
                "jobs_duplicate",
                "jobs_failed",
                "jobs_error_msg",
                "updated_at",
            ]
        )

    async def mark_queue_result(
        self,
        queue: CompanyCleaningQueue,
        *,
        status: str,
        error_msg: str = "",
        increment_retry: bool = False,
        jobs_summary: Optional[dict[str, Any]] = None,
    ) -> None:
        """Record the final outcome of a queue item and save it.

        Args:
            queue: Queue row to update; a ``company_name`` changed by the
                caller beforehand is saved as well.
            status: New status value to store.
            error_msg: Error text; ``None`` or empty is stored as ``""``.
            increment_retry: When True, ``retry_count`` is increased by one.
            jobs_summary: Optional job-sync summary; its ``jobs_fetched``,
                ``stored_success``, ``duplicate``, ``failed`` and ``error``
                entries are copied to the matching queue columns.
        """
        queue.status = status
        queue.error_msg = error_msg or ""
        queue.finished_at = datetime.now()
        if jobs_summary:
            queue.jobs_fetched = int(jobs_summary.get("jobs_fetched") or 0)
            queue.jobs_stored = int(jobs_summary.get("stored_success") or 0)
            queue.jobs_duplicate = int(jobs_summary.get("duplicate") or 0)
            queue.jobs_failed = int(jobs_summary.get("failed") or 0)
            queue.jobs_error_msg = jobs_summary.get("error") or ""
        if increment_retry:
            queue.retry_count += 1
        await queue.save(
            update_fields=[
                "company_name",
                "status",
                "error_msg",
                "retry_count",
                "finished_at",
                "jobs_fetched",
                "jobs_stored",
                "jobs_duplicate",
                "jobs_failed",
                "jobs_error_msg",
                "updated_at",
            ]
        )
|
||||
|
||||
|
||||
company_storage = CompanyStorageService()
|
||||
@ -1,3 +1,3 @@
|
||||
from .boss import BossService
|
||||
from .qcwy import QcwyService
|
||||
from .zhilian import ZhilianService
|
||||
from .boss import BossService as BossService
|
||||
from .qcwy import QcwyService as QcwyService
|
||||
from .zhilian import ZhilianService as ZhilianService
|
||||
|
||||
@ -12,10 +12,33 @@ from __future__ import annotations
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
from crawler_core.base import BaseFetcher, BaseSearcher
|
||||
from crawler_core.base import BaseFetcher, BaseSearcher, Result
|
||||
from app.services.crawler._zhilian_client import ZhilianClient, create_cgate_client, create_capi_client
|
||||
|
||||
|
||||
def _parse_zhilian_response(http_code: int, raw) -> Result:
    """Zhilian-specific response parsing.

    Args:
        http_code: HTTP status code of the response.
        raw: Decoded response body; expected to be a dict.

    Returns:
        A failed ``Result`` when the status is not 200 or the body is not a
        dict. Otherwise a successful ``Result`` carrying ``raw["data"]`` (or
        ``{}``). When that payload contains a ``list`` key, the items, the
        total count and an end-of-pages flag are filled in as well.
    """
    if http_code != 200:
        return Result(success=False, status_code=http_code,
                      error=f"HTTP 请求失败: {http_code}")
    if not isinstance(raw, dict):
        return Result(success=False, status_code=http_code, error="响应格式异常")

    payload = raw.get("data") or {}

    # List-style response
    if isinstance(payload, dict) and "list" in payload:
        # "list" may be present but null; treat that as an empty page.
        items = payload.get("list") or []
        # "pageInfo" may be present but null or not an object.
        page_info = raw.get("pageInfo")
        if not isinstance(page_info, dict):
            page_info = {}
        num_found = page_info.get("numFound") or payload.get("numFound")
        if num_found is None:
            # No total reported anywhere: fall back to the size of this page.
            num_found = len(items)
        return Result(
            success=True, status_code=200, data=payload,
            list=items, count=num_found,
            is_end_page=len(items) == 0,
        )

    return Result(success=True, status_code=200, data=payload)
|
||||
|
||||
|
||||
_SEARCH_BODY = {
|
||||
"eventScenario": "wxmpZhaopinSearchV2",
|
||||
"filterMinSalary": 1,
|
||||
@ -49,6 +72,9 @@ class SearchPositions(BaseSearcher):
|
||||
self.collected_purpose = collected_purpose
|
||||
self.filters = filters or {}
|
||||
|
||||
def _parse(self, http_code: int, raw) -> Result:
|
||||
return _parse_zhilian_response(http_code, raw)
|
||||
|
||||
def _build_params(self, page_index: int) -> dict:
|
||||
body = {**_SEARCH_BODY, "pageIndex": page_index, "pageSize": self.page_size}
|
||||
if self.collected_purpose:
|
||||
@ -90,6 +116,9 @@ class GetPositionDetail(BaseFetcher):
|
||||
def _build_params(self) -> dict:
|
||||
return {"number": self.number, "identity": self.identity, "resumeNumber": ""}
|
||||
|
||||
def _parse(self, http_code: int, raw) -> Result:
|
||||
return _parse_zhilian_response(http_code, raw)
|
||||
|
||||
|
||||
class GetCompanyExtDetail(BaseFetcher):
|
||||
ENDPOINT = "/riskstorm/company/getCompanyExtDetail"
|
||||
@ -102,6 +131,9 @@ class GetCompanyExtDetail(BaseFetcher):
|
||||
def _build_params(self) -> dict:
|
||||
return {"companyName": self.company_name, "companyNumber": self.company_number}
|
||||
|
||||
def _parse(self, http_code: int, raw) -> Result:
|
||||
return _parse_zhilian_response(http_code, raw)
|
||||
|
||||
|
||||
class GetCompanyDetail(BaseFetcher):
|
||||
ENDPOINT = "/positionbusiness/exposure/companyDetail"
|
||||
@ -113,6 +145,9 @@ class GetCompanyDetail(BaseFetcher):
|
||||
def _build_params(self) -> dict:
|
||||
return {"number": self.number}
|
||||
|
||||
def _parse(self, http_code: int, raw) -> Result:
|
||||
return _parse_zhilian_response(http_code, raw)
|
||||
|
||||
|
||||
class SearchCompanyPositions(BaseSearcher):
|
||||
ENDPOINT = "/capi/searchrecommend/searchPositionsCompany"
|
||||
@ -146,3 +181,6 @@ class SearchCompanyPositions(BaseSearcher):
|
||||
|
||||
def _request(self, params: dict) -> tuple[int, Any]:
|
||||
return self.http_client.get(self.ENDPOINT, params)
|
||||
|
||||
def _parse(self, http_code: int, raw) -> Result:
    """Turn the HTTP status code and raw payload into a ``Result`` using the shared Zhilian response parser."""
    parsed = _parse_zhilian_response(http_code, raw)
    return parsed
|
||||
|
||||
5
app/services/ingest/__init__.py
Normal file
5
app/services/ingest/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
# 触发所有平台配置注册
|
||||
from app.services.ingest.configs import * # noqa: F401, F403
|
||||
from app.services.ingest.service import IngestService
|
||||
|
||||
__all__ = ["IngestService"]
|
||||
4
app/services/ingest/configs/__init__.py
Normal file
4
app/services/ingest/configs/__init__.py
Normal file
@ -0,0 +1,4 @@
|
||||
# 导入各平台配置,触发注册
|
||||
from app.services.ingest.configs import boss # noqa: F401
|
||||
from app.services.ingest.configs import qcwy # noqa: F401
|
||||
from app.services.ingest.configs import zhilian # noqa: F401
|
||||
53
app/services/ingest/registry.py
Normal file
53
app/services/ingest/registry.py
Normal file
@ -0,0 +1,53 @@
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Callable, Dict, Any, List, Optional, Tuple
|
||||
|
||||
from app.log import logger
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class DedupFieldSpec:
    """Dedup field spec: extracts the value of one dedup column from a raw record."""

    # Name of the dedup column.
    column: str

    # Callable that receives the raw record dict and returns the column value (or None).
    extractor: Callable[[Dict[str, Any]], Optional[str]]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class PlatformConfig:
    """Platform configuration (immutable)."""

    platform: str
    channel: str
    data_type: str
    table: str
    # Dedup field specs; empty by default.
    dedup_fields: Tuple[DedupFieldSpec, ...] = ()
    # Optional callable mapping a raw record dict to a dict (or None); unset by default.
    push_mapper: Optional[Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]] = None

    @property
    def key(self) -> Tuple[str, str, str]:
        """The ``(platform, channel, data_type)`` triple identifying this configuration."""
        triple = (self.platform, self.channel, self.data_type)
        return triple

    @property
    def dedup_columns(self) -> List[str]:
        """Column names of every dedup field, in the order they are declared."""
        names: List[str] = []
        for spec in self.dedup_fields:
            names.append(spec.column)
        return names
|
||||
|
||||
|
||||
# 全局注册表
|
||||
_REGISTRY: Dict[Tuple[str, str, str], PlatformConfig] = {}
|
||||
|
||||
|
||||
def register(config: PlatformConfig) -> None:
    """Store ``config`` in the global registry under ``config.key``.

    Registering a key that is already present logs a warning and replaces the
    earlier entry.
    """
    registry_key = config.key
    already_present = registry_key in _REGISTRY
    if already_present:
        logger.warning(f"覆盖已有注册: {registry_key}")
    _REGISTRY[registry_key] = config
|
||||
|
||||
|
||||
def get_config(platform: str, channel: str, data_type: str) -> PlatformConfig:
    """Look up the configuration registered for ``(platform, channel, data_type)``.

    Raises:
        ValueError: when nothing is registered under that triple.
    """
    lookup = (platform, channel, data_type)
    if lookup not in _REGISTRY:
        raise ValueError(f"未注册的平台配置: platform={platform}, channel={channel}, data_type={data_type}")
    return _REGISTRY[lookup]
|
||||
|
||||
|
||||
def list_configs() -> List[PlatformConfig]:
    """Return a new list holding every registered configuration."""
    return [registered for registered in _REGISTRY.values()]
|
||||
83
app/services/ingest/remote_push.py
Normal file
83
app/services/ingest/remote_push.py
Normal file
@ -0,0 +1,83 @@
|
||||
import hashlib
|
||||
import time
|
||||
from typing import Dict, Any, List, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from app.log import logger
|
||||
|
||||
|
||||
# 同步辅助函数(无 await,纯计算)
|
||||
def safe_get(obj: Optional[Dict], key: str, default: str = "") -> str:
    """Read ``obj[key]`` as a string.

    Returns ``default`` when ``obj`` is None, the key is missing, or the stored
    value is None; any other value is passed through ``str``.
    """
    if obj is not None:
        found = obj.get(key)
        if found is not None:
            return str(found)
    return default
|
||||
|
||||
|
||||
def safe_join(data, default: str = "") -> str:
    """Render ``data`` as a comma-separated string.

    - ``None`` gives ``default``.
    - A list gives its truthy items, each passed through ``str`` and joined with
      ``","`` (so an empty list gives ``""``, not ``default``).
    - Anything else gives ``str(data)`` when truthy, otherwise ``default``.
    """
    if data is None:
        return default
    if not isinstance(data, list):
        return str(data) if data else default
    kept = [str(entry) for entry in data if entry]
    return ",".join(kept)
|
||||
|
||||
|
||||
# 模块级 httpx 单例
|
||||
_http_client: Optional[httpx.AsyncClient] = None
|
||||
|
||||
|
||||
def get_http_client() -> httpx.AsyncClient:
    """Return the module-level ``httpx.AsyncClient``.

    A new client (30-second timeout) is created when none exists yet or the
    stored one reports ``is_closed``.
    """
    global _http_client
    needs_new = _http_client is None or _http_client.is_closed
    if needs_new:
        _http_client = httpx.AsyncClient(timeout=30.0)
    return _http_client
|
||||
|
||||
|
||||
async def close_http_client() -> None:
    """Close the module-level HTTP client if one is open, then forget it."""
    global _http_client
    client = _http_client
    if client is None or client.is_closed:
        # Nothing open to close.
        return
    await client.aclose()
    _http_client = None
|
||||
|
||||
|
||||
def _build_auth_url() -> str:
|
||||
from_id = 9910056
|
||||
timestamp = int(time.time())
|
||||
salt = "jWcIqJK6QlR2syb6HQgpel9iOoOkj01G5MDFNtQLaTxhddHUTEnURsMe2RxCTYC8"
|
||||
token = hashlib.md5((salt + str(timestamp)).encode()).hexdigest()
|
||||
return f"http://external-data.qixin.com/extend/extend_data_push?from={from_id}&token={token}&time={timestamp}"
|
||||
|
||||
|
||||
_PUSH_HEADERS = {
|
||||
"Content-Type": "application/json",
|
||||
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
|
||||
}
|
||||
|
||||
|
||||
async def push_to_remote(data: Dict[str, Any]) -> bool:
    """Log one record that is due to be reported to the remote endpoint.

    The HTTP push is currently disabled (the call is kept below, commented
    out), so nothing is sent.

    Args:
        data: Payload dict; ``source_type``, ``title`` and ``company_name``
            (falling back to ``name``) are read for the log line.

    Returns:
        ``False`` — no push took place. The function previously fell off the
        end and returned ``None`` despite the ``bool`` annotation; ``False``
        keeps the same truthiness.
    """
    source_type = data.get("source_type", "未知平台")
    title = data.get("title", "未知职位")
    company = data.get("company_name", data.get("name", "未知公司"))
    logger.info(f"上报数据: [{source_type}] {title} - {company}")
    # Dump the payload through the logger at debug level instead of printing it to stdout.
    logger.debug(f"push payload: {data}")

    # Disabled push, kept for when the endpoint is re-enabled:
    # try:
    #     url = _build_auth_url()
    #     client = get_http_client()
    #     response = await client.post(url, json=data, headers=_PUSH_HEADERS)
    #     if response.status_code == 200:
    #         return True
    #     logger.error(f"数据发送失败: {response.status_code} - {response.text[:100]}")
    #     return False
    # except Exception as e:
    #     logger.error(f"发送异常: {e}")
    #     return False
    return False
|
||||
|
||||
|
||||
async def batch_push_to_remote(data_list: List[Dict[str, Any]]) -> None:
    """Push every record in ``data_list`` in order.

    An exception raised for one record is logged and does not stop the
    remaining records from being pushed.
    """

    async def _push_one(record: Dict[str, Any]) -> None:
        try:
            await push_to_remote(record)
        except Exception as exc:
            logger.error(f"批量推送单条失败: {exc}")

    for record in data_list:
        await _push_one(record)
|
||||
@ -1,108 +0,0 @@
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Dict, Any, List, Optional, Tuple
|
||||
from clickhouse_connect.driver import AsyncClient
|
||||
|
||||
|
||||
class IngestService:
    """Stores raw platform records as JSON rows in ClickHouse, with optional de-duplication.

    Each row carries the serialized record plus the platform-specific dedup
    columns (job id, company name, ...) extracted from it.
    """

    def __init__(self, client: "AsyncClient"):
        # Client used for both the duplicate lookups and the inserts.
        self.client = client

    def _table_name(self, platform: str, data_type: str) -> str:
        """Return the table name ``job_data.<platform>_<data_type>``."""
        return f"job_data.{platform}_{data_type}"

    @staticmethod
    def _text(value: Any) -> str:
        """Stringify a dedup value, mapping ``None`` to ``""`` rather than the text ``"None"``."""
        return "" if value is None else str(value)

    @staticmethod
    def _company_name(platform: str, data: Dict[str, Any]) -> Optional[str]:
        """Pick the company name from a record using the platform's field names."""
        if platform == "boss":
            # ``or {}`` also covers a key that is present but null.
            return (data.get("brandComInfoVO") or {}).get("brandName") or data.get("name")
        if platform == "qcwy":
            return data.get("fullCompanyName") or data.get("companyName")
        if platform == "zhilian":
            return data.get("companyName") or data.get("name")
        return None

    def _build_row(self, platform: str, data_type: str, data: Dict[str, Any]) -> Tuple[List[str], List[Any]]:
        """Build the column names and matching values for one insert row.

        Every row has ``id``, ``json_data``, ``created_at`` and ``updated_at``;
        the dedup columns appended after them depend on platform and data type.
        """
        now = datetime.now()
        columns = ["id", "json_data", "created_at", "updated_at"]
        values = [0, json.dumps(data, ensure_ascii=False), now, now]
        if data_type == "job":
            if platform == "boss":
                job_base = data.get("jobBaseInfoVO") or {}
                columns += ["job_id"]
                values += [self._text(job_base.get("jobId"))]
            elif platform == "qcwy":
                columns += ["job_id", "update_date_time"]
                values += [self._text(data.get("jobId")), self._text(data.get("updateDateTime"))]
            elif platform == "zhilian":
                columns += ["number", "first_publish_time"]
                values += [self._text(data.get("number")), self._text(data.get("firstPublishTime"))]
        elif data_type == "company":
            columns += ["company_name"]
            values += [self._text(self._company_name(platform, data))]
        return columns, values

    def _dup_conditions(self, platform: str, data_type: str, data: Dict[str, Any]) -> Optional[Tuple[str, List[Any]]]:
        """Return the ``(where_sql, params)`` identifying this record, or None.

        None means the record lacks the fields needed for a duplicate lookup
        (or the platform/data type is unknown), so no check is made for it.
        """
        if data_type == "job":
            if platform == "boss":
                job_id = (data.get("jobBaseInfoVO") or {}).get("jobId")
                if not job_id:
                    return None
                return "job_id = %s", [str(job_id)]
            if platform == "qcwy":
                job_id = data.get("jobId")
                update_dt = data.get("updateDateTime")
                if not job_id or not update_dt:
                    return None
                return "job_id = %s AND update_date_time = %s", [str(job_id), str(update_dt)]
            if platform == "zhilian":
                number = data.get("number")
                fpt = data.get("firstPublishTime")
                if not number or not fpt:
                    return None
                return "number = %s AND first_publish_time = %s", [str(number), str(fpt)]
            return None
        if data_type == "company":
            name = self._company_name(platform, data)
            if not name:
                return None
            return "company_name = %s", [str(name)]
        return None

    async def _exists(self, table: str, where_sql: str, params: List[Any]) -> bool:
        """Report whether ``table`` already holds a row matching the condition."""
        q = f"SELECT 1 FROM {table} WHERE {where_sql} LIMIT 1"
        r = await self.client.query(q, params)
        return bool(r.result_rows)

    async def store_single(self, platform: str, data_type: str, data: Dict[str, Any], check_duplicate: bool = True) -> Dict[str, int]:
        """Insert one record unless it is already stored.

        Returns:
            ``{"inserted": 1, "ignored": 0}`` or ``{"inserted": 0, "ignored": 1}``.
        """
        table = self._table_name(platform, data_type)
        if check_duplicate:
            cond = self._dup_conditions(platform, data_type, data)
            if cond and await self._exists(table, *cond):
                return {"inserted": 0, "ignored": 1}
        cols, vals = self._build_row(platform, data_type, data)
        await self.client.insert(table, [vals], column_names=cols)
        return {"inserted": 1, "ignored": 0}

    async def store_batch(self, platform: str, data_type: str, data_list: List[Dict[str, Any]], check_duplicate: bool = True) -> Dict[str, int]:
        """Insert many records with a single insert call, skipping duplicates.

        With ``check_duplicate`` on, a record is ignored when it is already in
        the table or when an earlier record of the same batch has the same
        dedup key.

        Returns:
            Counts of inserted and ignored records.
        """
        table = self._table_name(platform, data_type)
        if not data_list:
            return {"inserted": 0, "ignored": 0}
        rows: List[List[Any]] = []
        columns: Optional[List[str]] = None
        ignored = 0
        # Dedup keys already accepted in this batch. The table cannot reveal
        # these yet because the insert happens only after the loop.
        seen = set()
        for d in data_list:
            if check_duplicate:
                cond = self._dup_conditions(platform, data_type, d)
                if cond:
                    where_sql, params = cond
                    batch_key = tuple(params)
                    if batch_key in seen or await self._exists(table, where_sql, params):
                        ignored += 1
                        continue
                    seen.add(batch_key)
            cols, vals = self._build_row(platform, data_type, d)
            columns = columns or cols
            rows.append(vals)
        if rows:
            await self.client.insert(table, rows, column_names=columns)
        return {"inserted": len(rows), "ignored": ignored}
|
||||
@ -1,971 +0,0 @@
|
||||
import hashlib
|
||||
import time
|
||||
from typing import Dict, Any, Optional, List
|
||||
from enum import Enum
|
||||
import json
|
||||
from datetime import datetime
|
||||
|
||||
import requests
|
||||
from clickhouse_connect.driver import AsyncClient
|
||||
from app.log import logger
|
||||
|
||||
from tenacity import retry, stop_after_attempt, wait_exponential
|
||||
|
||||
|
||||
class DataType(str, Enum):
    """Data type enumeration."""

    # Members are also plain strings because of the ``str`` mixin.
    JOB = "job"
    COMPANY = "company"
|
||||
|
||||
|
||||
class PlatformType(str, Enum):
    """Platform type enumeration."""

    # Members are also plain strings because of the ``str`` mixin.
    BOSS = "boss"
    QCWY = "qcwy"
    ZHILIAN = "zhilian"
|
||||
|
||||
|
||||
class DataRouterService:
|
||||
"""通用数据路由服务 - 根据数据类型和平台自动选择对应的表进行存储"""
|
||||
|
||||
def __init__(self, clickhouse_client: AsyncClient):
|
||||
self.clickhouse_client = clickhouse_client
|
||||
# 移除平台特定仓库引用,改用通用数据接口
|
||||
|
||||
# 安全获取列表数据的辅助函数
|
||||
async def safe_join(self, data, default=""):
    """Safely render ``data`` as a comma-separated string.

    A list gives its truthy items joined with ``","``; ``None`` or any other
    falsy value gives ``default``; everything else gives ``str(data)``.
    """
    if isinstance(data, list):
        return ",".join([str(entry) for entry in data if entry])
    if data is None or not data:
        return default
    return str(data)
|
||||
|
||||
# 安全获取字符串数据的辅助函数
|
||||
async def safe_get(self, obj, key, default=""):
    """Safely read ``obj[key]`` as a string.

    Gives ``default`` when ``obj`` is falsy, the key is missing, or the stored
    value is None.
    """
    if not obj:
        return default
    found = obj.get(key)
    return default if found is None else str(found)
|
||||
|
||||
async def store_data(self,
                     data: Dict[str, Any],
                     data_type: DataType,
                     platform: PlatformType,
                     check_duplicate: bool = True) -> Dict[str, Any]:
    """Generic store entry point, backed by the JSON storage scheme.

    Args:
        data: Record to store.
        data_type: Data type (job/company).
        platform: Platform type (boss/qcwy/zhilian).
        check_duplicate: Whether to check for duplicate data.

    Returns:
        The result dict from ``_store_data_as_json``. If that call raises, the
        error is logged and a failure dict (``success`` False, with ``message``
        and ``error``) is returned instead of propagating the exception.
    """
    try:
        outcome = await self._store_data_as_json(data, data_type, platform, check_duplicate)
    except Exception as exc:
        reason = str(exc)
        logger.error(f"{platform} {data_type} 数据存储失败: {reason}")
        return {
            "success": False,
            "message": f"数据存储失败: {reason}",
            "duplicate": False,
            "error": reason
        }
    return outcome
|
||||
|
||||
def _get_json_table_name(self, data_type: DataType, platform: PlatformType) -> str:
    """Return the JSON table name for the pair, ``<platform value>_<data type value>``."""
    parts = (platform.value, data_type.value)
    return "{}_{}".format(*parts)
|
||||
|
||||
async def _store_data_as_json(self, data: Dict[str, Any], data_type: DataType, platform: PlatformType,
                              check_duplicate: bool = True) -> Dict[str, Any]:
    """Store one record using the JSON storage scheme.

    Steps: push the record to the remote server when a push payload can be
    prepared, optionally skip the insert if the record is already stored, then
    insert the serialized record together with its dedup columns.

    Args:
        data: Raw record.
        data_type: Data type (job/company).
        platform: Platform type (boss/qcwy/zhilian).
        check_duplicate: When False the duplicate lookup is skipped. This flag
            was previously accepted but ignored.

    Returns:
        A result dict with ``success``, ``message``, ``duplicate``,
        ``table_name`` and ``storage_type``.

    Raises:
        Exception: any failure is logged and re-raised to the caller.
    """
    try:
        json_table_name = self._get_json_table_name(data_type, platform)

        # The remote push runs before the duplicate check, as it always has.
        remote_data = await self._prepare_remote_push_data(data, data_type, platform)
        if remote_data:
            await self.send_to_remote_server(remote_data)

        if check_duplicate:
            duplicate_message = await self._describe_duplicate(json_table_name, data, data_type, platform)
            if duplicate_message is not None:
                logger.info(duplicate_message)
                return {
                    "success": True,
                    "message": "数据重复,跳过插入",
                    "duplicate": True,
                    "table_name": json_table_name,
                    "storage_type": "json"
                }

        current_time = datetime.now()
        json_data = {
            'id': 0,  # auto-generated
            'json_data': json.dumps(data, ensure_ascii=False),
            'created_at': current_time,
            'updated_at': current_time
        }
        # Same dedup columns the batch path writes.
        self._add_dedup_fields(json_data, data, data_type, platform)

        await self._insert_data_to_clickhouse(json_table_name, json_data)

        logger.info(f"{platform} {data_type} 数据以JSON格式存储成功到表 {json_table_name}")
        return {
            "success": True,
            "message": "JSON数据存储成功",
            "duplicate": False,
            "table_name": json_table_name,
            "storage_type": "json"
        }

    except Exception as e:
        logger.error(f"JSON数据存储失败: {str(e)}")
        raise

async def _describe_duplicate(self, table_name: str, data: Dict[str, Any], data_type: DataType,
                              platform: PlatformType) -> Optional[str]:
    """Return the log line for an already-stored record, or None when it should be inserted.

    None is also returned when the record lacks the fields the platform's
    duplicate lookup needs.
    """
    if data_type == DataType.JOB:
        if platform == PlatformType.QCWY:
            job_id = data.get('jobId')
            update_date_time = data.get('updateDateTime')
            if job_id and update_date_time and await self._check_qcwy_duplicate(
                    table_name, job_id, update_date_time):
                return f"QCWY职位数据重复,跳过插入: jobId={job_id}, updateDateTime={update_date_time}"
        elif platform == PlatformType.BOSS:
            # ``or {}`` also covers a key that is present but null.
            job_id = (data.get('jobBaseInfoVO') or {}).get('jobId')
            if job_id and await self._check_boss_duplicate(table_name, job_id):
                return f"BOSS职位数据重复,跳过插入: jobId={job_id}"
        elif platform == PlatformType.ZHILIAN:
            number = data.get('number')
            first_publish_time = data.get('firstPublishTime')
            if number and first_publish_time and await self._check_zhilian_duplicate(
                    table_name, number, first_publish_time):
                return f"智联职位数据重复,跳过插入: number={number}, firstPublishTime={first_publish_time}"
    elif data_type == DataType.COMPANY:
        if platform == PlatformType.BOSS:
            company_name = data.get('name') or (data.get('companyFullInfoVO') or {}).get('name')
            if company_name and await self._check_boss_company_duplicate(table_name, company_name):
                return f"BOSS公司数据重复,跳过插入: companyName={company_name}"
        elif platform == PlatformType.QCWY:
            company_name = data.get('companyName') or data.get('company_name')
            if company_name and await self._check_qcwy_company_duplicate(table_name, company_name):
                return f"QCWY公司数据重复,跳过插入: companyName={company_name}"
        elif platform == PlatformType.ZHILIAN:
            company_name = data.get('companyName') or data.get('name')
            if company_name and await self._check_zhilian_company_duplicate(table_name, company_name):
                return f"智联公司数据重复,跳过插入: companyName={company_name}"
    return None
|
||||
|
||||
async def query_json_data(self,
                          platform: PlatformType,
                          data_type: DataType,
                          json_fields: Optional[Dict[str, str]] = None,
                          limit: int = 100,
                          offset: int = 0) -> Dict[str, Any]:
    """Query records stored with the JSON scheme, newest first.

    Args:
        platform: Platform type (required).
        data_type: Data type (required).
        json_fields: Fields to extract from the JSON, as ``{alias: json_path}``.
            Aliases must be plain identifiers; paths are sent as bound
            parameters. Without it every column is selected.
        limit: Maximum number of rows to return.
        offset: Number of rows to skip.

    Returns:
        On success a dict with ``success``, ``data`` (row dicts; a string
        ``json_data`` holding a JSON object is merged into its row),
        ``columns``, ``count`` (total rows in the table) and ``table_name``.
        On any failure a dict with ``success`` False, ``message`` and ``error``.
    """
    import re  # local import: only needed for alias validation

    try:
        json_table_name = self._get_json_table_name(data_type, platform)

        # Total row count of the table.
        count_query = f"SELECT count() FROM job_data.{json_table_name}"
        count_result = await self.clickhouse_client.query(count_query)
        total_count = count_result.result_rows[0][0] if count_result.result_rows else 0

        parameters: Dict[str, Any] = {}
        if json_fields:
            select_fields = ['created_at']
            for position, (alias, json_path) in enumerate(json_fields.items()):
                # An alias cannot be bound as a parameter, so only identifiers are accepted.
                if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", str(alias)):
                    raise ValueError(f"invalid column alias: {alias!r}")
                placeholder = f"json_path_{position}"
                parameters[placeholder] = str(json_path)
                # The tripled braces emit a literal {json_path_N:String} placeholder.
                select_fields.append(f"JSONExtractString(json_data, {{{placeholder}:String}}) as {alias}")
            query = f"SELECT {', '.join(select_fields)} FROM job_data.{json_table_name}"
        else:
            # No fields requested: select every column.
            query = f"SELECT * FROM job_data.{json_table_name}"

        # int() keeps anything but whole numbers out of the SQL text.
        query += f" ORDER BY created_at DESC LIMIT {int(limit)} OFFSET {int(offset)}"

        result = await self.clickhouse_client.query(query, parameters=parameters or None)

        # Convert rows to dicts, merging a parsed json_data object into each row.
        data = []
        for row in result.result_rows:
            item = dict(zip(result.column_names, row))
            if 'json_data' in item and isinstance(item['json_data'], str):
                try:
                    json_content = json.loads(item['json_data'])
                    if isinstance(json_content, dict):
                        item.update(json_content)
                except (ValueError, TypeError):
                    # Unparseable json_data: keep the row with the raw string only.
                    pass
            data.append(item)

        logger.info(f"JSON数据查询成功,从表 {json_table_name} 返回 {len(result.result_rows)} 条记录")
        return {
            "success": True,
            "data": data,
            "columns": result.column_names,
            "count": total_count,
            "table_name": json_table_name
        }

    except Exception as e:
        logger.error(f"JSON数据查询失败: {str(e)}")
        return {
            "success": False,
            "message": f"查询失败: {str(e)}",
            "error": str(e)
        }
|
||||
|
||||
async def _insert_data_to_clickhouse(self, table_name: str, data: Dict[str, Any]) -> None:
|
||||
"""向ClickHouse表插入数据
|
||||
|
||||
Args:
|
||||
table_name: 表名
|
||||
data: 要插入的数据字典
|
||||
"""
|
||||
try:
|
||||
columns = list(data.keys())
|
||||
values = [[data.get(col) for col in columns]]
|
||||
await self.clickhouse_client.insert(f"job_data.{table_name}", values, column_names=columns)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"向表 {table_name} 插入数据失败: {str(e)}")
|
||||
raise e
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def _check_qcwy_duplicate(self, table_name: str, job_id: str, update_date_time: str) -> Optional[
        Dict[str, Any]]:
    """Check for an existing QCWY job row with the same job_id and update_date_time.

    Returns:
        ``{"id", "created_at"}`` of the first matching row, or None when no
        row matches or the lookup fails (failures are logged, not raised).
    """
    # NOTE(review): exceptions are swallowed below, so the retry decorator
    # presumably never sees a failure to retry — confirm whether that is intended.
    try:
        # Braces are doubled so the f-string emits the literal {name:Type}
        # placeholders bound from ``parameters``. Single braces made Python
        # format the values itself, which raised and disabled the check.
        query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE job_id = {{job_id:String}}
              AND update_date_time = {{udt:String}}
            LIMIT 1
        """
        result = await self.clickhouse_client.query(
            query, parameters={"job_id": str(job_id), "udt": str(update_date_time)})

        if result.result_rows:
            logger.info(f"发现QCWY重复数据: jobId={job_id}, updateDateTime={update_date_time}")
            return {
                "id": result.result_rows[0][0],
                "created_at": result.result_rows[0][1]
            }
        return None

    except Exception as e:
        logger.error(f"检查QCWY重复数据失败: {str(e)}")
        return None
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def _check_zhilian_duplicate(self, table_name: str, number: str, first_publish_time: str) -> Optional[
        Dict[str, Any]]:
    """Check for an existing Zhilian job row with the same number and first_publish_time.

    Returns:
        ``{"id", "created_at", "number", "first_publish_time"}`` for the first
        matching row, or None when no row matches or the lookup fails
        (failures are logged, not raised).
    """
    # NOTE(review): exceptions are swallowed below, so the retry decorator
    # presumably never sees a failure to retry — confirm whether that is intended.
    try:
        # Braces are doubled so the f-string emits the literal {name:Type}
        # placeholders bound from ``parameters``. Single braces made Python
        # format the values itself, which raised and disabled the check.
        query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE number = {{number:String}}
              AND first_publish_time = {{fpt:String}}
            LIMIT 1
        """
        result = await self.clickhouse_client.query(
            query, parameters={"number": str(number), "fpt": str(first_publish_time)})

        if result.result_rows:
            logger.info(f"发现智联重复数据: number={number}, firstPublishTime={first_publish_time}")
            return {
                "id": result.result_rows[0][0],
                "created_at": result.result_rows[0][1],
                "number": number,
                "first_publish_time": first_publish_time
            }
        return None

    except Exception as e:
        logger.error(f"检查智联重复数据失败: {str(e)}")
        return None
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def _check_boss_duplicate(self, table_name: str, job_id: Any) -> Optional[Dict[str, Any]]:
    """Check for an existing BOSS job row with the same job_id.

    Args:
        table_name: Table name, without the ``job_data.`` prefix.
        job_id: Job id of any type; it is compared as its ``str`` form.

    Returns:
        ``{"id", "created_at"}`` of the first matching row, or None when no
        row matches or the lookup fails (failures are logged, not raised).
    """
    # NOTE(review): exceptions are swallowed below, so the retry decorator
    # presumably never sees a failure to retry — confirm whether that is intended.
    try:
        # Braces are doubled so the f-string emits the literal {job_id:String}
        # placeholder bound from ``parameters``. Single braces made Python
        # format the value itself, which raised and disabled the check.
        query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE job_id = {{job_id:String}}
            LIMIT 1
        """
        result = await self.clickhouse_client.query(query, parameters={"job_id": str(job_id)})
        if result.result_rows:
            logger.info(f"发现BOSS重复数据: jobId={job_id}")
            return {
                "id": result.result_rows[0][0],
                "created_at": result.result_rows[0][1]
            }

        return None

    except Exception as e:
        logger.error(f"检查BOSS重复数据失败: {str(e)}")
        return None
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def _check_boss_company_duplicate(self, table_name: str, company_name: str) -> Optional[Dict[str, Any]]:
    """Check for an existing BOSS company row with the same company_name.

    Returns:
        ``{"id", "created_at"}`` of the first matching row, or None when no
        row matches or the lookup fails (failures are logged, not raised).
    """
    # NOTE(review): exceptions are swallowed below, so the retry decorator
    # presumably never sees a failure to retry — confirm whether that is intended.
    try:
        # Braces are doubled so the f-string emits the literal
        # {company_name:String} placeholder bound from ``parameters``. Single
        # braces made Python format the value itself, which raised and
        # disabled the check.
        query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE company_name = {{company_name:String}}
            LIMIT 1
        """
        result = await self.clickhouse_client.query(query, parameters={"company_name": str(company_name)})
        if result.result_rows:
            logger.info(f"发现BOSS公司重复数据: companyName={company_name}")
            return {
                "id": result.result_rows[0][0],
                "created_at": result.result_rows[0][1]
            }

        return None

    except Exception as e:
        logger.error(f"检查BOSS公司重复数据失败: {str(e)}")
        return None
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def _check_qcwy_company_duplicate(self, table_name: str, company_name: str) -> Optional[Dict[str, Any]]:
    """Check for an existing QCWY company row with the same company_name.

    Returns:
        ``{"id", "created_at"}`` of the first matching row, or None when no
        row matches or the lookup fails (failures are logged, not raised).
    """
    # NOTE(review): exceptions are swallowed below, so the retry decorator
    # presumably never sees a failure to retry — confirm whether that is intended.
    try:
        # Braces are doubled so the f-string emits the literal
        # {company_name:String} placeholder bound from ``parameters``. Single
        # braces made Python format the value itself, which raised and
        # disabled the check.
        query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE company_name = {{company_name:String}}
            LIMIT 1
        """
        result = await self.clickhouse_client.query(query, parameters={"company_name": str(company_name)})
        if result.result_rows:
            logger.info(f"发现QCWY公司重复数据: companyName={company_name}")
            return {
                "id": result.result_rows[0][0],
                "created_at": result.result_rows[0][1]
            }

        return None

    except Exception as e:
        logger.error(f"检查QCWY公司重复数据失败: {str(e)}")
        return None
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def _check_qcwy_company_duplicate_by_name(self, table_name: str, company_name: str) -> Optional[
        Dict[str, Any]]:
    """Check for an existing QCWY company row with the same company_name.

    Returns:
        ``{"id", "created_at"}`` of the first matching row, or None when no
        row matches or the lookup fails (failures are logged, not raised).
    """
    # NOTE(review): exceptions are swallowed below, so the retry decorator
    # presumably never sees a failure to retry — confirm whether that is intended.
    try:
        # Braces are doubled so the f-string emits the literal
        # {company_name:String} placeholder bound from ``parameters``. Single
        # braces made Python format the value itself, which raised and
        # disabled the check.
        query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE company_name = {{company_name:String}}
            LIMIT 1
        """
        result = await self.clickhouse_client.query(query, parameters={"company_name": str(company_name)})
        if result.result_rows:
            logger.info(f"发现QCWY公司重复数据: companyName={company_name}")
            return {
                "id": result.result_rows[0][0],
                "created_at": result.result_rows[0][1]
            }

        return None

    except Exception as e:
        logger.error(f"检查QCWY公司重复数据失败: {str(e)}")
        return None
|
||||
|
||||
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
async def _check_zhilian_company_duplicate(self, table_name: str, company_name: str) -> Optional[Dict[str, Any]]:
    """Check for an existing Zhilian company row with the same company_name.

    Returns:
        ``{"id", "created_at"}`` of the first matching row, or None when no
        row matches or the lookup fails (failures are logged, not raised).
    """
    # NOTE(review): exceptions are swallowed below, so the retry decorator
    # presumably never sees a failure to retry — confirm whether that is intended.
    try:
        # Braces are doubled so the f-string emits the literal
        # {company_name:String} placeholder bound from ``parameters``. Single
        # braces made Python format the value itself, which raised and
        # disabled the check.
        query = f"""
            SELECT id, created_at
            FROM job_data.{table_name}
            WHERE company_name = {{company_name:String}}
            LIMIT 1
        """
        result = await self.clickhouse_client.query(query, parameters={"company_name": str(company_name)})
        if result.result_rows:
            logger.info(f"发现智联公司重复数据: companyName={company_name}")
            return {
                "id": result.result_rows[0][0],
                "created_at": result.result_rows[0][1]
            }

        return None

    except Exception as e:
        logger.error(f"检查智联公司重复数据失败: {str(e)}")
        return None
|
||||
|
||||
async def send_to_remote_server(self, data: Dict[str, Any]) -> bool:
    """Send one payload to the remote server (simplified version).

    The payload is posted as the JSON body, unchanged.

    Args:
        data: Payload dict to send.

    Returns:
        bool: True when the server answers with HTTP 200, False on any other
        status or on any exception (both are logged).
    """
    import asyncio  # local import: used only to move the blocking call off the event loop

    # Log the key fields of what is being reported.
    source_type = data.get('source_type', '未知平台')
    title = data.get('title', '未知职位')
    company_name = data.get('company_name', data.get('name', '未知公司'))
    logger.info(f"📤 上报数据: [{source_type}] {title} - {company_name}")

    try:
        # Authentication parameters.
        # NOTE(review): the salt is a credential committed to source and the
        # endpoint is plain HTTP — consider moving both to configuration.
        from_id = 9910056
        timestamp = int(time.time())
        salt = 'jWcIqJK6QlR2syb6HQgpel9iOoOkj01G5MDFNtQLaTxhddHUTEnURsMe2RxCTYC8'

        # Token: hex MD5 of the salt followed by the timestamp.
        token_string = salt + str(timestamp)
        token = hashlib.md5(token_string.encode()).hexdigest()

        url = f'http://external-data.qixin.com/extend/extend_data_push?from={from_id}&token={token}&time={timestamp}'
        headers = {
            'Content-Type': 'application/json',
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36'
        }
        # requests.post blocks, so run it in a worker thread; calling it
        # directly would stall the event loop for up to the 30 s timeout.
        response = await asyncio.to_thread(requests.post, url, json=data, headers=headers, timeout=30)
        if response.status_code == 200:
            return True
        else:
            logger.error(f"❌ 数据发送失败: {response.status_code} - {response.text[:100]}")
            return False

    except Exception as e:
        logger.error(f"❌ 发送异常: {str(e)}")
        return False
|
||||
|
||||
async def batch_store_data(self,
                           data_list: List[Dict[str, Any]],
                           data_type: DataType,
                           platform: PlatformType,
                           check_duplicate: bool = True) -> Dict[str, Any]:
    """Store many records using one batch insert.

    Args:
        data_list: Records to store.
        data_type: Data type (job/company).
        platform: Platform type (boss/qcwy/zhilian).
        check_duplicate: Whether to check for duplicate data; forwarded to the
            batch insert as ``ignore_duplicates``.

    Returns:
        A summary dict with ``total``, ``success``, ``failed``, ``duplicate``
        and ``errors``. If the batch path itself raises, the result of the
        one-by-one fallback is returned instead.
    """
    results = {
        "total": len(data_list),
        "success": 0,
        "failed": 0,
        "duplicate": 0,
        "errors": []
    }

    if not data_list:
        return results

    try:
        json_table_name = self._get_json_table_name(data_type, platform)

        valid_data_list = []
        remote_push_data_list = []

        # Step 1: prepare the insert row and the remote-push payload per record.
        for i, data in enumerate(data_list):
            try:
                current_time = datetime.now()
                json_data = {
                    'id': 0,  # auto-generated
                    'json_data': json.dumps(data, ensure_ascii=False),
                    'created_at': current_time,
                    'updated_at': current_time
                }

                self._add_dedup_fields(json_data, data, data_type, platform)

                valid_data_list.append(json_data)

                remote_data = await self._prepare_remote_push_data(data, data_type, platform)
                if remote_data:
                    remote_push_data_list.append(remote_data)

            except Exception as e:
                results["failed"] += 1
                results["errors"].append({
                    "index": i,
                    "error": f"数据预处理失败: {str(e)}"
                })

        # Step 2: one batch insert; duplicates are handled by the insert helper.
        if valid_data_list:
            try:
                insert_result = await self._batch_insert_to_clickhouse(json_table_name, valid_data_list,
                                                                       ignore_duplicates=check_duplicate)
                results["success"] = insert_result["inserted"]
                results["duplicate"] = insert_result["ignored"]
            except Exception as e:
                logger.error(f"批量插入失败: {str(e)}")
                # Add to the preprocessing failures from step 1; plain
                # assignment here used to discard them.
                results["failed"] += len(valid_data_list)
                results["errors"].append({
                    "error": f"批量插入失败: {str(e)}"
                })

        # Step 3: remote push; a failure here does not change the storage result.
        if remote_push_data_list:
            try:
                await self._batch_send_to_remote_server(remote_push_data_list)
                logger.info(f"批量推送到远程服务器成功: {len(remote_push_data_list)} 条数据")
            except Exception as e:
                logger.warning(f"批量推送到远程服务器失败: {str(e)}")

    except Exception as e:
        logger.error(f"批量存储数据失败: {str(e)}")
        # The batch path failed as a whole: fall back to storing records one by one.
        return await self._fallback_individual_store(data_list, data_type, platform, check_duplicate)

    return results
|
||||
|
||||
def _add_dedup_fields(self, json_data: Dict[str, Any], data: Dict[str, Any], data_type: DataType,
                      platform: PlatformType):
    """Copy the platform-specific de-duplication keys from ``data`` into ``json_data``.

    Args:
        json_data: Row being prepared for insertion; mutated in place.
        data: Raw record as received from the crawler.
        data_type: Whether the record is a job or a company.
        platform: Source platform of the record.

    A key is only written when the source field is present, so rows without
    an identifier simply carry no de-duplication column.
    """
    if data_type == DataType.JOB:
        if platform == PlatformType.BOSS:
            # ``or {}`` also covers an explicit null value for the nested object.
            job_base_info = data.get('jobBaseInfoVO') or {}
            if 'jobId' in job_base_info:
                json_data['job_id'] = str(job_base_info['jobId'])

        elif platform == PlatformType.QCWY:
            if 'jobId' in data:
                json_data['job_id'] = str(data['jobId'])
            if 'updateDateTime' in data:
                json_data['update_date_time'] = str(data['updateDateTime'])

        elif platform == PlatformType.ZHILIAN:
            if 'number' in data:
                json_data['number'] = str(data['number'])
            if 'firstPublishTime' in data:
                json_data['first_publish_time'] = str(data['firstPublishTime'])

    elif data_type == DataType.COMPANY:
        company_name = None
        if platform == PlatformType.BOSS:
            # A record with "companyFullInfoVO": null used to raise AttributeError
            # here and be counted as a failed row; treat null like a missing object.
            company_name = data.get('name') or (data.get('companyFullInfoVO') or {}).get('name')
        elif platform == PlatformType.QCWY:
            company_name = data.get('companyName') or data.get('company_name')
        elif platform == PlatformType.ZHILIAN:
            company_name = data.get('companyName') or data.get('name')

        if company_name:
            json_data['company_name'] = str(company_name)
|
||||
|
||||
async def _prepare_remote_push_data(self, data: Dict[str, Any], data_type: DataType, platform: PlatformType) -> \
        Optional[Dict[str, Any]]:
    """Map a raw job record onto the flat payload pushed to the remote server.

    Args:
        data: Raw job record as received from the crawler.
        data_type: Record type; anything other than a job is not pushed.
        platform: Source platform, which selects the field mapping.

    Returns:
        The payload dict, or ``None`` when the record is not a job, the
        platform has no mapping, or building the payload raised.
    """
    if data_type != DataType.JOB:
        return None

    def _blank_if_none(value):
        # Keeps "None" out of formatted strings when a key is present but null.
        return "" if value is None else value

    try:
        if platform == PlatformType.QCWY:
            welfare_list = data.get("jobWelfareCodeDataList")
            if isinstance(welfare_list, list):
                labels = []
                for item in welfare_list:
                    if not isinstance(item, dict):
                        continue
                    label = (item.get("chineseTitle") or item.get("typeTitle")
                             or item.get("englishTitle") or item.get("code"))
                    # Entries with no label at all used to be joined in as "None".
                    if label is not None:
                        labels.append(str(label))
                welfare_str = ",".join(labels)
            elif isinstance(welfare_list, str):
                welfare_str = welfare_list.replace("[", "").replace("]", "")
            else:
                welfare_str = ""

            # Address: prefer the flat field, fall back to the nested work location.
            raw_location = data.get("location") or ""
            if not raw_location:
                work_loc = data.get("workLocation") or {}
                raw_location = work_loc.get("workAddress") or work_loc.get("address") or ""
            location_val = raw_location or "位置信息未找到"

            # Area: prefer the flat string, fall back to city + landmark.
            raw_area = data.get("jobAreaString") or ""
            if not raw_area:
                level_detail = data.get("jobAreaLevelDetail") or {}
                city_str = level_detail.get("cityString") or ""
                landmark_str = level_detail.get("landMarkString") or ""
                raw_area = f"{city_str}{landmark_str}".strip()
            area_val = raw_area or "位置信息未找到"

            # Salary is written low-high, matching the BOSS branch below
            # (the previous code emitted max-min for this platform only).
            salary_low = _blank_if_none(data.get("jobSalaryMin"))
            salary_high = _blank_if_none(data.get("jobSalaryMax"))
            industry_major = _blank_if_none(data.get("major1Str"))
            industry_minor = _blank_if_none(data.get("major2Str"))

            remote_resp = {
                'source_type': '前程无忧',
                'name': data.get("companyName"),
                'title': data.get("jobName"),
                'title_addr': data.get("jobName"),
                'description': data.get("jobDescribe"),
                'age': "",
                'sex': "",
                'number': "",
                'education': data.get("degreeString"),
                'skill': await self.safe_join(data.get("jobTagsForOrder")),
                'welfare': welfare_str,
                'years': data.get("workYearString"),
                'salary': f'{salary_low}-{salary_high}',
                'location': location_val,
                'position': area_val,
                'date': data.get("confirmDateString"),
                'start_date': data.get("confirmDateString"),
                'end_date': "",
                'job_type': data.get("termStr"),
                'size': data.get("companySizeString"),
                'employer_type': data.get("companyTypeString"),
                'industry': f'{industry_major}-{industry_minor}',
                'job_1st_class': "",
                'job_2nd_class': "",
                'job_3rd_class': "",
                'job_4th_class': "",
                'url': data.get("jobHref"),
                'company_id': data.get("coId"),
                'company_name': data.get("fullCompanyName"),
                'company_url': data.get("companyHref"),
                'company_desc': data.get("company_desc", ""),
                'base_data': data
            }
            return remote_resp

        elif platform == PlatformType.BOSS:
            bossBaseInfoVO = data.get("bossBaseInfoVO", {})
            jobBaseInfoVO = data.get("jobBaseInfoVO", {})
            brandComInfoVO = data.get("brandComInfoVO", {})
            boss_resp = {
                'source_type': 'Boss直聘',
                'name': await self.safe_get(brandComInfoVO, "brandName"),
                'common_name': await self.safe_get(bossBaseInfoVO, "brandName"),
                'title': await self.safe_get(jobBaseInfoVO, "positionName"),
                'title_addr': await self.safe_get(jobBaseInfoVO, "positionName"),
                'description': await self.safe_get(jobBaseInfoVO, "jobDesc"),
                'education': await self.safe_get(jobBaseInfoVO, "degreeName"),
                'skill': await self.safe_join(jobBaseInfoVO.get("requiredSkills") if jobBaseInfoVO else None),
                'welfare': await self.safe_join(jobBaseInfoVO.get("salaryWelfareInfo") if jobBaseInfoVO else None),
                'years': await self.safe_get(jobBaseInfoVO, "experienceName"),
                'salary': f'{await self.safe_get(jobBaseInfoVO, "lowSalary")}-{await self.safe_get(jobBaseInfoVO, "highSalary")}',
                'location': await self.safe_get(jobBaseInfoVO, "locationName", "位置信息未找到"),
                'position': await self.safe_get(jobBaseInfoVO, "locationDesc", "位置信息未找到"),
                'job_type': "全职",
                'size': await self.safe_get(brandComInfoVO, "scaleName"),
                'employer_type': "全职",
                'industry': await self.safe_get(brandComInfoVO, "industryName"),
                'job_1st_class': "",
                'job_2nd_class': "",
                'job_3rd_class': "",
                'job_4th_class': "",
                'date': "",
                'start_date': "",
                'end_date': "",
                'age': "",
                'sex': "",
                'number': "",
                'url': f"https://www.zhipin.com/job_detail/{await self.safe_get(jobBaseInfoVO, 'encryptJobId')}.html",
                'company_id': await self.safe_get(brandComInfoVO, "encryptBrandId"),
                'company_name': await self.safe_get(brandComInfoVO, "brandName"),
                'company_url': f"https://www.zhipin.com/gongsi/{await self.safe_get(brandComInfoVO, 'encryptBrandId')}.html",
                'company_desc': await self.safe_get(brandComInfoVO, "introduce"),
                'base_data': data
            }
            return boss_resp

        elif platform == PlatformType.ZHILIAN:
            # A null skillLabel or a tag without "value" used to raise and drop
            # the whole record; skip only the unusable tags instead.
            skill_values = [
                tag.get('value')
                for tag in (data.get('skillLabel') or [])
                if isinstance(tag, dict) and tag.get('value')
            ]
            work_area = f"{await self.safe_get(data, 'workCity')}{await self.safe_get(data, 'cityDistrict')}"
            zhilian_resp = {
                'source_type': '智联招聘',
                'name': await self.safe_get(data, 'companyName'),
                'common_name': await self.safe_get(data, 'companyName'),
                'title': await self.safe_get(data, 'name'),
                'title_addr': await self.safe_get(data, 'name'),
                'description': await self.safe_get(data, 'jobSummary'),
                'education': await self.safe_get(data, 'education'),
                'skill': await self.safe_join(skill_values),
                'welfare': '',  # this mapping takes no welfare field from Zhilian records
                'years': await self.safe_get(data, 'workingExp'),
                'salary': await self.safe_get(data, 'salary60'),
                'location': work_area,
                'position': work_area,
                'job_type': await self.safe_get(data, 'workType'),
                'size': await self.safe_get(data, 'companySize'),
                'employer_type': await self.safe_get(data, 'propertyName'),
                'industry': await self.safe_get(data, 'industryName'),
                'job_1st_class': '',
                'job_2nd_class': '',
                'job_3rd_class': '',
                'job_4th_class': '',
                'date': await self.safe_get(data, 'firstPublishTime'),
                'start_date': '',
                'end_date': '',
                'age': '',
                'sex': '',
                'number': str(await self.safe_get(data, 'recruitNumber')),
                'url': await self.safe_get(data, 'positionURL'),
                'company_id': str(await self.safe_get(data, 'companyId')),
                'company_name': await self.safe_get(data, 'companyName'),
                'company_url': await self.safe_get(data, 'companyUrl'),
                'company_desc': await self.safe_get(data, 'companyDesc'),
                'base_data': data
            }
            return zhilian_resp
    except Exception as e:
        logger.error(f"准备远程推送数据失败: {str(e)}")
    return None
|
||||
|
||||
async def _batch_insert_to_clickhouse(self, table_name: str, data_list: List[Dict[str, Any]],
|
||||
ignore_duplicates: bool = True) -> Dict[str, int]:
|
||||
"""批量插入数据到ClickHouse,支持忽略重复数据
|
||||
|
||||
Args:
|
||||
table_name: 表名
|
||||
data_list: 数据列表
|
||||
ignore_duplicates: 是否忽略重复数据
|
||||
|
||||
Returns:
|
||||
插入结果统计 {"inserted": 插入数量, "ignored": 忽略数量}
|
||||
"""
|
||||
result = {"inserted": 0, "ignored": 0}
|
||||
|
||||
if not data_list:
|
||||
return result
|
||||
|
||||
try:
|
||||
columns = list(data_list[0].keys())
|
||||
|
||||
filtered_list = data_list
|
||||
if ignore_duplicates:
|
||||
dedup_cols = self._get_dedup_columns_for_table(table_name)
|
||||
if dedup_cols:
|
||||
if len(dedup_cols) == 1:
|
||||
key_col = dedup_cols[0]
|
||||
candidate_keys = list({str(d.get(key_col, "")) for d in data_list if d.get(key_col)})
|
||||
if candidate_keys:
|
||||
query = f"""
|
||||
SELECT {key_col}
|
||||
FROM job_data.{table_name}
|
||||
WHERE {key_col} IN {{keys:Array(String)}}
|
||||
"""
|
||||
existing = await self.clickhouse_client.query(query, parameters={"keys": candidate_keys})
|
||||
existing_set = {str(r[0]) for r in existing.result_rows}
|
||||
filtered_list = [d for d in data_list if str(d.get(key_col, "")) not in existing_set]
|
||||
elif len(dedup_cols) == 2:
|
||||
c1, c2 = dedup_cols
|
||||
candidate_c1 = list({str(d.get(c1, "")) for d in data_list if d.get(c1)})
|
||||
if candidate_c1:
|
||||
query = f"""
|
||||
SELECT {c1}, {c2}
|
||||
FROM job_data.{table_name}
|
||||
WHERE {c1} IN {{keys:Array(String)}}
|
||||
"""
|
||||
existing = await self.clickhouse_client.query(query, parameters={"keys": candidate_c1})
|
||||
existing_map = {}
|
||||
for r in existing.result_rows:
|
||||
k = str(r[0])
|
||||
v = str(r[1])
|
||||
existing_map.setdefault(k, set()).add(v)
|
||||
filtered_list = [
|
||||
d for d in data_list
|
||||
if str(d.get(c1, "")) not in existing_map or str(d.get(c2, "")) not in existing_map.get(str(d.get(c1, "")), set())
|
||||
]
|
||||
|
||||
batch_values = [[item.get(col) for col in columns] for item in filtered_list]
|
||||
if batch_values:
|
||||
await self.clickhouse_client.insert(f"job_data.{table_name}", batch_values, column_names=columns)
|
||||
result["inserted"] = len(batch_values)
|
||||
result["ignored"] = len(data_list) - result["inserted"]
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"批量插入到表 {table_name} 失败: {str(e)}")
|
||||
raise e
|
||||
|
||||
return result
|
||||
|
||||
def _get_dedup_columns_for_table(self, table_name: str) -> List[str]:
|
||||
"""获取表的去重列"""
|
||||
if table_name == "boss_job":
|
||||
return ["job_id"]
|
||||
if table_name == "qcwy_job":
|
||||
return ["job_id", "update_date_time"]
|
||||
if table_name == "zhilian_job":
|
||||
return ["number", "first_publish_time"]
|
||||
if table_name in ("boss_company", "qcwy_company", "zhilian_company"):
|
||||
return ["company_name"]
|
||||
return []
|
||||
|
||||
async def _batch_send_to_remote_server(self, data_list: List[Dict[str, Any]]) -> None:
|
||||
"""批量发送数据到远程服务器"""
|
||||
for data in data_list:
|
||||
try:
|
||||
await self.send_to_remote_server(data)
|
||||
except Exception as e:
|
||||
logger.error(f"批量推送单条数据失败: {str(e)}")
|
||||
# 继续处理下一条数据
|
||||
|
||||
async def _fallback_individual_store(self, data_list: List[Dict[str, Any]], data_type: DataType,
                                     platform: PlatformType, check_duplicate: bool) -> Dict[str, Any]:
    """Store the records one at a time via ``store_data`` and tally the outcomes.

    Returns:
        A summary with ``total``, ``success``, ``failed`` and ``duplicate``
        counters plus an ``errors`` list of ``{"index", "error"}`` entries.
    """
    summary = {
        "total": len(data_list),
        "success": 0,
        "failed": 0,
        "duplicate": 0,
        "errors": []
    }

    for position, record in enumerate(data_list):
        try:
            outcome = await self.store_data(record, data_type, platform, check_duplicate)
            if outcome["success"]:
                summary["success"] += 1
                continue
            if outcome.get("duplicate"):
                summary["duplicate"] += 1
                continue
            failure = outcome.get("message", "未知错误")
        except Exception as store_error:
            failure = str(store_error)

        # Reached only for a reported failure or a raised exception.
        summary["failed"] += 1
        summary["errors"].append({
            "index": position,
            "error": failure
        })

    return summary
|
||||
|
||||
|
||||
# 创建全局实例的工厂函数
|
||||
def create_data_router_service(clickhouse_client: AsyncClient) -> DataRouterService:
    """Factory: build a ``DataRouterService`` bound to ``clickhouse_client``."""
    service = DataRouterService(clickhouse_client)
    return service
|
||||
@ -1,46 +0,0 @@
|
||||
FROM python:3.11-slim

# Point apt at the Aliyun mirror (newer images use debian.sources, older ones sources.list).
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources || \
    sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list || true

# System dependencies: Node.js is required for PyExecJS.
# --no-install-recommends keeps optional packages out of the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
    nodejs \
    npm \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Use the npmmirror registry for npm.
RUN npm config set registry https://registry.npmmirror.com

WORKDIR /app

# Configure pip to use the Tsinghua mirror (written to a config file).
RUN mkdir -p /root/.pip && \
    echo '[global]' > /root/.pip/pip.conf && \
    echo 'index-url = https://pypi.tuna.tsinghua.edu.cn/simple' >> /root/.pip/pip.conf && \
    echo 'trusted-host = pypi.tuna.tsinghua.edu.cn' >> /root/.pip/pip.conf && \
    echo 'timeout = 120' >> /root/.pip/pip.conf

# Copy requirements first so the dependency layer is cached across code changes.
COPY requirements.txt .

# Install Python dependencies through the configured mirror.
RUN pip install --no-cache-dir -r requirements.txt

# Playwright: only Chromium is needed for this project.
# Browser binaries are downloaded from the npmmirror mirror.
ENV PLAYWRIGHT_DOWNLOAD_HOST=https://npmmirror.com/mirrors/playwright
# One layer installs the browser and its system libraries, then removes the
# apt lists that the dependency install leaves behind.
RUN playwright install --with-deps chromium \
    && rm -rf /var/lib/apt/lists/*

COPY . .

# Expose the port
EXPOSE 8000

# Run the application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
@ -1,12 +0,0 @@
|
||||
# Smoke request: 51job (qcwy) company search.
curl -L 'http://127.0.0.1:9999/api/v1/company/qcwy/search' \
-H 'Content-Type: application/json' \
-d '{
"keyword": "中信期货有限公司"
}'

# Smoke request: Zhilian company search, restricted to one city.
curl -L 'http://127.0.0.1:9999/api/v1/company/zhilian/search' \
-H 'Content-Type: application/json' \
-d '{
"keyword": "中信期货有限公司",
"city":"北京"
}'
|
||||
@ -1,65 +0,0 @@
|
||||
// Minimal stand-ins for the browser globals the original page script expects.
var window = {};

var arg3 = null;
var arg4 = null;
var arg5 = null;
var arg6 = null;
var arg7 = null;
var arg8 = null;
var arg9 = null;
var arg10 = null;

// Output position i takes the input character at (ACW_SC_PERMUTATION[i] - 1).
var ACW_SC_PERMUTATION = [15, 35, 29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11, 39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36];

// Hex key XOR-ed byte by byte against the permuted input.
var ACW_SC_XOR_KEY = "3000176000856006061501533003690027800375";

/**
 * Reorder the characters of `text` according to ACW_SC_PERMUTATION.
 * Positions that point past the end of `text` contribute nothing.
 */
function unsbox(text) {
    var reordered = [];
    for (var slot = 0; slot < ACW_SC_PERMUTATION.length; slot++) {
        // charAt returns "" when out of range, like the holes in the old sparse array.
        reordered.push(text.charAt(ACW_SC_PERMUTATION[slot] - 1));
    }
    return reordered.join("");
}

/**
 * XOR two hex strings byte by byte, stopping at the shorter one.
 * Returns lowercase hex with every byte zero-padded to two digits.
 */
function hexXor(left, right) {
    var result = "";
    for (var pos = 0; pos < left.length && pos < right.length; pos += 2) {
        var mixed = (parseInt(left.slice(pos, pos + 2), 16) ^ parseInt(right.slice(pos, pos + 2), 16)).toString(16);
        result += mixed.length === 1 ? "0" + mixed : mixed;
    }
    return result;
}

/**
 * Derive the `acw_sc__v2` cookie value from the `arg1` challenge string.
 *
 * Same result as before, but without patching String.prototype on every
 * call, without leaking `arg2` as an implicit global, without the debug
 * log, and without the busy-loop on window._phantom (which never ran
 * because `window` is the empty stub above).
 *
 * @param {string} arg1 hex challenge string taken from the anti-bot page
 * @returns {string} lowercase hex cookie value
 */
var l = function (arg1) {
    var arg2 = hexXor(unsbox(arg1), ACW_SC_XOR_KEY);
    return arg2;
};
|
||||
// var arg1 = "FAA6CB46CF724D58FF82E5310687947623413114";
|
||||
// l(arg1)
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,91 +0,0 @@
|
||||
import crypto from 'crypto';
|
||||
|
||||
// 使用 crypto
|
||||
/**
 * HMAC-SHA256 of `message` under `key`, returned as a lowercase hex string.
 */
const hmacSHA256 = (message, key) => {
    const mac = crypto.createHmac('sha256', key);
    mac.update(message);
    return mac.digest('hex');
};
|
||||
|
||||
/**
 * Sign a request config: HMAC-SHA256 over `url` followed by `data`
 * (empty string when `data` is falsy), keyed with the fixed site key.
 * Logs the url and data before signing.
 */
function A(t) {
    console.log(t.url);
    console.log(t.data);
    var body = t.data ? t.data : "";
    var e = t.url + body;
    return hmacSHA256(e, "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b");
}
|
||||
|
||||
|
||||
/**
 * Object-spread helper lifted from the site bundle: merges every argument
 * after the first into `e` and returns `e`.
 * NOTE(review): `r` and `i` are not defined in this file, so calling this
 * throws a ReferenceError; presumably they are the bundle's own-keys and
 * defineProperty helpers. Confirm before use.
 */
function a(e) {
    var sources = Array.prototype.slice.call(arguments, 1);
    sources.forEach(function (source, offset) {
        var n = null != source ? source : {};
        var position = offset + 1; // index of this source in `arguments`
        if (position % 2) {
            r(Object(n), true).forEach(function (key) {
                Object(i["a"])(e, key, n[key]);
            });
        } else if (Object.getOwnPropertyDescriptors) {
            Object.defineProperties(e, Object.getOwnPropertyDescriptors(n));
        } else {
            r(Object(n)).forEach(function (key) {
                Object.defineProperty(e, key, Object.getOwnPropertyDescriptor(n, key));
            });
        }
    });
    return e;
}
|
||||
|
||||
// Sample request config captured from the 51job search page; input for A(t).
// The query string had "&timestamp=" and "&degree=" mangled into "×tamp=" and
// "°ree=" by HTML entity decoding; they are restored here so the signed URL
// carries both parameters.
var t = {
    "transitional": {"silentJSONParsing": true, "forcedJSONParsing": true, "clarifyTimeoutError": false},
    "transformRequest": [null],
    "transformResponse": [null],
    "timeout": 30000,
    "xsrfCookieName": "XSRF-TOKEN",
    "xsrfHeaderName": "X-XSRF-TOKEN",
    "maxContentLength": -1,
    "maxBodyLength": -1,
    "headers": {
        "common": {"Accept": "application/json, text/plain, */*"},
        "delete": {},
        "get": {"Content-Type": "application/x-www-form-urlencoded"},
        "head": {},
        "post": {"Content-Type": "application/json"},
        "put": {"Content-Type": "application/x-www-form-urlencoded"},
        "patch": {"Content-Type": "application/x-www-form-urlencoded"}
    },
    "baseURL": "https://we.51job.com",
    "withCredentials": true,
    "url": "/api/job/search-pc?api_key=51job&timestamp=1769136341&keyword=%E5%8D%8E%E4%B8%BA%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&searchType=2&function=&industry=&jobArea=010000&jobArea2=&landmark=&metro=&salary=&workYear=&degree=&companyType=&companySize=&jobType=&issueDate=&sortType=0&pageNum=1&requestId=&keywordType=&pageSize=20&source=1&accountId=&pageCode=sou%7Csou%7Csoulb&scene=7",
    "method": "get",
    "property": {"keywordType": ""}
};
|
||||
|
||||
// Sample "property" header payload captured from the same page (not used below).
var b = {
    partner: "cn_bing_com",
    webId: 2,
    fromdomain: "51job_web",
    frompageUrl: "https://we.51job.com/",
    pageUrl: "https://we.51job.com/pc/search?jobArea=010000&keyword=%E5%8D%8E%E4%B8%BA%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&searchType=2&keywordType=",
    identityType: "",
    userType: "",
    isLogin: "否",
    accountid: ""
};

// Print the signature for the sample request.
var sampleSignature = A(t);
console.log(sampleSignature);
|
||||
|
||||
// function wordsToHex(words) {
|
||||
// // CryptoJS 使用 32 位有符号整数存储,需要处理
|
||||
// let hex = '';
|
||||
// for (let i = 0; i < words.length; i++) {
|
||||
// // 将负数转换为无符号整数
|
||||
// const word = words[i] >>> 0;
|
||||
// // 转换为十六进制并补零
|
||||
// hex += word.toString(16).padStart(8, '0');
|
||||
// }
|
||||
// return hex;
|
||||
// }
|
||||
//
|
||||
// // 你的数据
|
||||
// const words = [-762966511, 1702028048, 94455509, -201850815,
|
||||
// 300412866, 1405396681, 85275542, 246713406];
|
||||
//
|
||||
// console.log(wordsToHex(words))
|
||||
|
||||
/*
|
||||
* sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2219be8d71f8213f1-0fd9b910813aa58-4c657b58-3686400-19be8d71f831716%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTliZThkNzFmODIxM2YxLTBmZDliOTEwODEzYWE1OC00YzY1N2I1OC0zNjg2NDAwLTE5YmU4ZDcxZjgzMTcxNiJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%2219be8d71f8213f1-0fd9b910813aa58-4c657b58-3686400-19be8d71f831716%22%7D; ssxmod_itna=1-Gqmx0DuD2Dc0D=Ni73itD2Dp=DmpsKeQDzxCH9P0CCDLxn4xGdY2=Cw3DyD7Tp4RG5DaQi2Yea0xGXKwDA5DnCx7YDt=RcwxK06dvxK=W0mitswe6uDuYLP2GGRRgW_GlDMEHLA6C5N7qxDHwd4KxGLDY=DCqxq57eD4f3Dt4DIDAYDDxDWDYEPxGUQDG=D7rTi5pWtxi3DboaDmd2WC=FD03q=EWFoDDtAbeG2bETqDDNqF9G3_lh3_PD_bW9QKtWemFxPneDMbxGX7YCqnlH2oyDWpFkUsao3xB=gxBQbyPnhwETadZanDY4lGrWYY2DIjGxWxiGG1i05Q03nwsWmwlG1Gv_GDxhw4SrUDDAt_hWDHBRqW0tK2lj5/bc_9yYtAbYW=LgrRxxWqqRAOIotBhNi47fD5=4qf0esGthu5oiPeD; ssxmod_itna2=1-Gqmx0DuD2Dc0D=Ni73itD2Dp=DmpsKeQDzxCH9P0CCDLxn4xGdY2=Cw3DyD7Tp4RG5DaQi2YeaKxDfrQfQGh4qBFjq03_jSefWDlO03BqKSSfAaeFuhD2y0F5nKj4LMzWF2qLViLAjiLzGteYAj1KAULY4hzS3=uiLiHAktq7AQK04=RCrN4_lNnDaNPYDr4nhTEjfu/3d5Fcwil7pUxfDu7yjj5TT0UnkUbM4F0FALQk19oO64i1g2QsibdzqxtPn8oOB3wpj5FVm6R_LF2EKxZIWFfaGt9oNT4U_0IjQx40hUsUKLNOBzuR1Mh=_gTlLdLS53B3OE4dGDB8GdjhOf4MYhuE37oTUMtTCwOOD7WhhjwgohMumFghOCNeDxRqr92NTeIRW=oOeThvw7DBG5/DoFShd7v5ZxwYEKiDD
|
||||
* */
|
||||
@ -1,327 +0,0 @@
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
import execjs
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
import requests
|
||||
from urllib.parse import unquote, quote
|
||||
from typing import Optional, Dict
|
||||
import os
|
||||
|
||||
|
||||
class SignGenerator:
    """Builds the request signatures and anti-bot cookie used by the 51job web API."""

    def __init__(self):
        # Signing key (taken from the site's JS code).
        self.secret_key = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b"
        self.secret_key_bytes = self.secret_key.encode('utf-8')

    def hmac_sha256(self, message, key):
        """Return the hex HMAC-SHA256 of ``message`` under ``key`` (str or bytes each)."""
        key_bytes = key.encode('utf-8') if isinstance(key, str) else key
        message_bytes = message.encode('utf-8') if isinstance(message, str) else message
        return hmac.new(key_bytes, message_bytes, hashlib.sha256).hexdigest()

    @staticmethod
    def _serialize_data(data):
        """Turn a request body into the string appended to the URL before signing."""
        if data is None:
            return ""
        if isinstance(data, dict):
            # Compact separators, matching how the body is serialized for signing.
            return json.dumps(data, ensure_ascii=False, separators=(',', ':'))
        return data

    def generate_signature(self, t):
        """Sign a request config dict (mirrors function ``A`` in the JS code).

        JS logic: ``signature = hmacSHA256(url + (data || ""), secret_key)``.
        A missing or null ``data`` signs the URL alone; a dict is serialized
        to compact JSON first. (Null and empty-dict bodies previously raised
        TypeError on the string concatenation.)
        """
        url = t.get("url", "")
        return self.generate_signature_from_components(url, t.get("data"))

    def generate_signature_from_components(self, url, data=None):
        """Sign ``url`` plus an optional body (simpler interface to the same signature)."""
        message = url + self._serialize_data(data)
        return self.hmac_sha256(message, self.secret_key)

    def generate_acw_sc__v2(self, arg1):
        """Compute the ``acw_sc__v2`` cookie by running ``l(arg1)`` from ``04.js``.

        ``04.js`` is read from the directory containing this module.
        Returns ``None`` when the script yields an empty result.
        """
        current_dir = os.path.dirname(os.path.abspath(__file__))
        js_file_path = os.path.join(current_dir, '04.js')
        with open(js_file_path, 'r', encoding='utf-8') as f:
            js = f.read()
        acw_sc__v2 = execjs.compile(js).call('l', arg1)
        return acw_sc__v2 or None

    def generate_company_detail(self, cid: str) -> dict:
        """Sign a company-detail request for the encrypted company id ``cid``.

        Returns:
            ``{"signature": <hex digest>, "timestamp": <unix seconds>}``; the
            same timestamp must be sent in the request URL.
        """
        timestamp = int(time.time())
        # String to sign. The parameter separator is "&timestamp=": the previous
        # literal read "51job×tamp=", which signs a URL with no timestamp parameter.
        message = f"/open/noauth/company-info/pc-info?api_key=51job&timestamp={timestamp}&encryCompanyId={cid}"
        signature = self.hmac_sha256(message, self.secret_key)
        return {"signature": signature, "timestamp": timestamp}
|
||||
|
||||
|
||||
def search_company(keyword: str, job_area: str = "000000") -> Optional[Dict]:
    """Search 51job for a company whose full name equals ``keyword``.

    Args:
        keyword: Search keyword (the company name).
        job_area: Area code; the default "000000" means nationwide.

    Returns:
        A dict with ``fullCompanyName``, ``companyName`` and ``companyHref``
        for the first exact match, otherwise ``None`` (also on any request
        or parsing error).
    """
    signer = SignGenerator()
    session = requests.Session()

    timestamp = str(int(time.time()))

    params = {
        'api_key': '51job',
        'timestamp': timestamp,
        'keyword': keyword,
        'searchType': '2',  # 2 = search by company
        'function': '',
        'industry': '',
        'jobArea': job_area,
        'jobArea2': '',
        'landmark': '',
        'metro': '',
        'salary': '',
        'workYear': '',
        'degree': '',
        'companyType': '',
        'companySize': '',
        'jobType': '',
        'issueDate': '',
        'sortType': '0',
        'pageNum': '1',
        'requestId': '',
        'pageSize': '20',
        'source': '1',
        'accountId': '',
        'pageCode': 'sou|sou|soulb',
        'scene': '7'
    }

    # Build the query string once and use the very same bytes for both the
    # signature and the request. Previously only `keyword` was percent-encoded
    # for signing while the HTTP layer encoded every value (e.g. "|" -> "%7C"),
    # so the signature covered a different string than the one actually sent.
    url_path = '/api/job/search-pc'
    query_string = '&'.join(f'{k}={quote(str(v))}' for k, v in params.items())
    full_url = f"{url_path}?{query_string}"
    request_url = f"https://we.51job.com{full_url}"

    sign = signer.generate_signature_from_components(full_url)

    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'From-Domain': '51job_web',
        'Pragma': 'no-cache',
        'Referer': f'https://we.51job.com/pc/search?keyword={quote(keyword)}&searchType=2&sortType=0&metro=',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        'account-id': '',
        'partner': '',
        'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2Fpc%2Fsearch%3Fkeyword%3D' + quote(
            keyword) + '%26searchType%3D2%26sortType%3D0%26metro%3D%22%2C%22identityType%22%3A%22%22%2C%22userType%22%3A%22%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountid%22%3A%22%22%7D',
        'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sign': sign,
        'user-token': '',
        'uuid': uuid.uuid4().hex,
    }

    def fetch(cookies=None):
        # NOTE(review): verify=False disables TLS certificate checks; kept as in
        # the original, but it should be removed if the network path allows.
        return session.get(request_url, headers=headers, cookies=cookies, verify=False, timeout=30)

    try:
        # First request; the site may answer with a challenge page containing arg1.
        response = fetch()

        if 'arg1' in response.text:
            arg1_match = re.findall(r"arg1='(.*?)';", response.text, re.S)
            if arg1_match:
                acw_sc__v2 = signer.generate_acw_sc__v2(arg1_match[0])
                if acw_sc__v2:
                    cookies = {
                        'guid': uuid.uuid4().hex,
                        'acw_sc__v2': acw_sc__v2
                    }
                    # Second request carries the challenge cookie; any cookies it
                    # sets are merged in before the third, final request.
                    response2 = fetch(cookies)
                    cookies.update(response2.cookies.get_dict())
                    response = fetch(cookies)

        if response.status_code == 200:
            try:
                data = response.json()
                if data.get('status') == '1' and 'resultbody' in data:
                    resultbody = data['resultbody']
                    if 'job' in resultbody and 'items' in resultbody['job']:
                        wanted = keyword.strip()
                        # Scan the returned postings for an exact company-name match.
                        for item in resultbody['job']['items']:
                            # `or ''` guards against a null name, which used to raise
                            # and abort the search for the remaining items.
                            full_company_name = (item.get('fullCompanyName') or '').strip()
                            if full_company_name == wanted:
                                return {
                                    'fullCompanyName': full_company_name,
                                    'companyName': (item.get('companyName') or '').strip(),
                                    'companyHref': (item.get('companyHref') or '').strip()
                                }
            except json.JSONDecodeError:
                print(f"[错误] 响应不是有效的JSON: {response.text[:200]}")
                return None

        return None

    except Exception as e:
        print(f"[错误] 请求失败: {e}")
        import traceback
        print(traceback.format_exc())
        return None
|
||||
|
||||
|
||||
def parse_json_company_desc(uri: str) -> dict:
    """Fetch a company's description and address from the 51job company-info API.

    Args:
        uri: Company page URL, e.g.
            "https://jobs.51job.com/all/coUT9QPQdhBzEGY1A1VjQ.html".

    Returns:
        On success a dict with ``company_desc``, ``company_location`` and
        ``encryCompanyId``. On failure a dict with only ``company_desc`` and
        ``company_location``, both set to a failure message.
    """
    # Last path segment without ".html" is "co<id>" (or the bare id).
    sy = uri.split("/")[-1]
    if sy.endswith(".html"):
        sy = sy[:-len(".html")]
    # Strip only the leading "co". The previous str.replace("co", "") also
    # removed any "co" occurring inside the id itself, corrupting it.
    cid = sy[2:] if sy.startswith("co") else sy

    signer = SignGenerator()
    generate_company_detail_info = signer.generate_company_detail(cid)
    headers = {
        'Host': 'cupid.51job.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'sign': generate_company_detail_info["signature"],
        'uuid': '1e6151f7bc3ce8d7e526c88d7d6592cd',
        'From-Domain': '51job_web',
        'account-id': '',
        'user-token': '',
        'partner': '',
        'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fjobs.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fjobs.51job.com%2Fall%2Fco4194496.html%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountId%22%3A%22%22%2C%22shortPageCode%22%3A%22gsxq%7Czwlb%7Cgsxqlb%22%2C%22pageCode%22%3A%22gsxq%7Czwlb%7Cgsxqlb%22%7D',
        'Origin': 'https://jobs.51job.com',
        'Connection': 'keep-alive',
        'Referer': 'https://jobs.51job.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'TE': 'trailers',
    }

    try:
        # "&timestamp=" restored: the literal previously read "51job×tamp=",
        # which sends no timestamp parameter at all.
        desc_url = f"https://cupid.51job.com/open/noauth/company-info/pc-info?api_key=51job&timestamp={generate_company_detail_info['timestamp']}&encryCompanyId={cid}"
        # NOTE(review): verify=False disables TLS certificate checks; kept as in the original.
        # A timeout is set so a stalled connection cannot hang the caller forever.
        res = requests.get(url=desc_url, headers=headers, verify=False, timeout=30)
        if not res:  # a Response is falsy for 4xx/5xx status codes
            return {"company_desc": "请求失败", "company_location": "请求失败"}
        company_dinfo = res.json()

        coinfo = company_dinfo["resultbody"]["coinfo"]
        return {"company_desc": coinfo["coinfo"], "company_location": coinfo["caddr"],"encryCompanyId":coinfo["encryCompanyId"]}

    except Exception as e:
        print(f"解析HTML失败: {e}")
        return {"company_desc": "解析失败", "company_location": "解析失败"}
|
||||
|
||||
|
||||
# 使用示例
|
||||
if __name__ == "__main__":
    # Manual smoke test: look one company up, then fetch a company's details.
    keyword = "华为技术有限公司"
    result = search_company(keyword)
    if not result:
        print(f"未找到匹配的公司: {keyword}")
    else:
        print("找到匹配的公司:")
        print(f" 全称: {result['fullCompanyName']}")
        print(f" 简称: {result['companyName']}")
        print(f" 链接: {result['companyHref']}")
    detail = parse_json_company_desc("https://jobs.51job.com/all/coA2RXNgBnATgPaQJn.html")
    print(detail)
|
||||
@ -1,92 +0,0 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
import time
|
||||
import uuid
|
||||
import requests
|
||||
import re
|
||||
import json
|
||||
import execjs
|
||||
|
||||
|
||||
headers = {
|
||||
'Accept': 'application/json, text/plain, */*',
|
||||
'Accept-Language': 'zh',
|
||||
'Cache-Control': 'no-cache',
|
||||
'Connection': 'keep-alive',
|
||||
'From-Domain': '51job_web',
|
||||
'Pragma': 'no-cache',
|
||||
'Referer': 'https://we.51job.com/pc/search?keyword=java&searchType=2&sortType=0&metro=',
|
||||
'Sec-Fetch-Dest': 'empty',
|
||||
'Sec-Fetch-Mode': 'cors',
|
||||
'Sec-Fetch-Site': 'same-origin',
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
|
||||
'account-id': '',
|
||||
'partner': '',
|
||||
'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2Fpc%2Fsearch%3Fkeyword%3Djava%26searchType%3D2%26sortType%3D0%26metro%3D%22%2C%22identityType%22%3A%22%22%2C%22userType%22%3A%22%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountid%22%3A%22%22%7D',
|
||||
'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
|
||||
'sec-ch-ua-mobile': '?0',
|
||||
'sec-ch-ua-platform': '"Windows"',
|
||||
'sign': '839932c059141791d8a003f0e6652e14facbf788a502df374fecf9c107d93b9e',
|
||||
'user-token': '',
|
||||
'uuid': '1687228791235576552',
|
||||
}
|
||||
params = {
|
||||
'api_key': '51job',
|
||||
'timestamp': '1769139097',
|
||||
'keyword': '华为技术有限公司',
|
||||
'searchType': '2',
|
||||
'function': '',
|
||||
'industry': '',
|
||||
'jobArea': '000000',
|
||||
'jobArea2': '',
|
||||
'landmark': '',
|
||||
'metro': '',
|
||||
'salary': '',
|
||||
'workYear': '',
|
||||
'degree': '',
|
||||
'companyType': '',
|
||||
'companySize': '',
|
||||
'jobType': '',
|
||||
'issueDate': '',
|
||||
'sortType': '0',
|
||||
'pageNum': '1',
|
||||
'requestId': '',
|
||||
'pageSize': '20',
|
||||
'source': '1',
|
||||
'accountId': '',
|
||||
'pageCode': 'sou|sou|soulb',
|
||||
'scene':'7'
|
||||
}
|
||||
|
||||
# 换成自己的代理,或者不用,单个ip应该有限制
|
||||
proxies = {
|
||||
"http":"http://xxx",
|
||||
"https":"http://xxxx"
|
||||
}
|
||||
|
||||
|
||||
SEARCH_URL = 'https://we.51job.com/api/job/search-pc'
# The challenge page embeds its seed as  arg1='...';
CHALLENGE_RE = re.compile(r"arg1='(.*?)';", re.S)
# Compiled lazily (inside the try below) so a missing 04.js is reported like any
# other per-request failure, and reused so the file is read/compiled only once.
js_context = None

for i in range(1, 2):
    try:
        # The context manager releases the session's connection pool even when
        # one of the requests raises.
        with requests.Session() as session:
            print("第%s次请求:" % i)
            # NOTE(review): verify=False disables TLS certificate validation on
            # every request in this script; confirm this is intentional.
            response = session.get(SEARCH_URL, params=params, headers=headers, verify=False)
            print(response.text[:300])

            challenge = CHALLENGE_RE.search(response.text)
            if challenge is None:
                # No arg1 seed in the body, so there is no challenge to solve.
                # Previously this raised IndexError and the response was lost.
                print(response.text)
            else:
                arg1 = challenge.group(1)
                print('arg1--->', arg1)
                cookie = {'guid': uuid.uuid4().hex}

                if js_context is None:
                    with open('04.js', 'r', encoding='utf-8') as f:
                        js_context = execjs.compile(f.read())
                # Function 'l' in 04.js derives the acw_sc__v2 cookie from arg1.
                acw_sc__v2 = js_context.call('l', arg1)
                print('acw_sc__v2-->', acw_sc__v2)
                cookie["acw_sc__v2"] = acw_sc__v2

                # Second request: present the solved cookie and collect whatever
                # extra cookies the server hands back.
                response2 = session.get(SEARCH_URL, params=params, headers=headers, cookies=cookie, verify=False)
                cookie.update(response2.cookies.get_dict())

                # Third request: repeat the search with the full cookie set.
                response = session.get(SEARCH_URL, params=params, headers=headers, cookies=cookie, verify=False)
                print(response.text)
        time.sleep(0.5)
    except Exception as e:
        # Best-effort script: report the failure and move on to the next attempt.
        print(e)
|
||||
@ -1 +0,0 @@
|
||||
# 智联招聘
|
||||
File diff suppressed because it is too large
Load Diff
@ -1,68 +0,0 @@
|
||||
import math
|
||||
import copy
|
||||
|
||||
# 32-symbol alphabet: each symbol encodes one 5-bit group.
R = "0123456789ABCDEFGHIJKLMNOPQRSTUV"


def a(e: str) -> str:
    """Return the symbol of ``R`` selected by the binary-digit string ``e``.

    Args:
        e: A string of binary digits (for example ``"00101"``) whose value is
            a valid index into ``R``.

    Returns:
        The single character ``R[int(e, 2)]``.
    """
    return R[int(e, 2)]


def n(e: str) -> str:
    """Encode ``e`` as base-32 text over its 16-bit UTF-16 code units.

    Every UTF-16 code unit of ``e`` contributes 16 bits (most significant bit
    first). The bit stream is right-padded with zeros to a multiple of 5 and
    each 5-bit group is mapped through ``a``.

    Args:
        e: The text to encode (for example a search keyword).

    Returns:
        The encoded string; ``""`` for empty input.
    """
    # Encoding to UTF-16-BE yields exactly the 16-bit units that JavaScript's
    # charCodeAt returns, including surrogate pairs for characters above
    # U+FFFF (ord() would give more than 16 bits for those). "surrogatepass"
    # keeps lone surrogates encodable instead of raising.
    units = e.encode("utf-16-be", "surrogatepass")
    bits = "".join(format(byte, "08b") for byte in units)

    # Right-pad with zeros to a multiple of 5 bits.
    bits = bits.ljust(5 * math.ceil(len(bits) / 5), "0")

    return "".join(a(bits[i:i + 5]) for i in range(0, len(bits), 5))
|
||||
|
||||
|
||||
def generate_url(e: dict) -> str:
    """Build the path-and-query suffix of a search URL from a parameter dict.

    The keys ``jl``, ``jt``, ``in``, ``kw`` and ``p`` become ``/``-separated
    path segments, in that fixed order, each written as ``<key><value>``. The
    ``kw`` value is first encoded with ``n``. Every other key with a truthy
    value is appended as ``key=value`` in a ``?``/``&`` query string, in the
    dict's own order. Keys with falsy values are dropped everywhere.

    Args:
        e: Search parameters. The dict is not modified.

    Returns:
        The suffix, e.g. ``"jl530/p2?x=1"``; ``""`` when nothing is set.
    """
    path_keys = ("jl", "jt", "in", "kw", "p")

    segments = []
    for key in path_keys:
        value = e.get(key)
        if not value:
            continue
        # Only the keyword is encoded; the other values are written as-is.
        segments.append(f"{key}{n(value) if key == 'kw' else value}")

    # Remaining truthy entries go to the query string. Values are not
    # URL-escaped here, matching the previous behaviour.
    query = [
        f"{key}={value}"
        for key, value in e.items()
        if value and key not in path_keys
    ]

    a_path = "/".join(segments)
    if query:
        a_path += "?" + "&".join(query)
    return a_path
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Demo: build and print the search URL for one city code and one keyword.
    demo_suffix = generate_url({'jl': 530, 'kw': 'app推广经理'})
    print(f"https://www.zhaopin.com/sou/{demo_suffix}")
|
||||
@ -1,284 +0,0 @@
|
||||
from playwright.sync_api import sync_playwright, BrowserContext, Page
|
||||
import time
|
||||
import json
|
||||
import os
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
from company_spider.zhilianzhaopin_company.searcc_kw import generate_url
|
||||
|
||||
|
||||
class CityLoader:
    """Singleton that maps city names to city codes loaded from a JSON file.

    The first construction reads the file and fills ``city_map``; every later
    ``CityLoader(...)`` call returns that same instance without reloading.
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        # Create the single shared instance on first use only.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, city_file="city.json"):
        """Load the city table once.

        Args:
            city_file: File name resolved against this module's directory
                (an absolute path is used as given by ``os.path.join``).
        """
        # __init__ runs on every CityLoader() call; skip if already loaded.
        if hasattr(self, 'city_map'):
            return

        current_dir = os.path.dirname(os.path.abspath(__file__))
        self.file_path = os.path.join(current_dir, city_file)
        self.city_map = {}
        self._load_cities()

    def _load_cities(self):
        """Read the JSON file and index its ``allCity`` list.

        Failures are reported with ``print`` and leave ``city_map`` with
        whatever was loaded so far; nothing is raised.
        """
        if not os.path.exists(self.file_path):
            print(f"City file not found: {self.file_path}")
            return

        try:
            with open(self.file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            self._parse_city_data(data.get("allCity", []))
        except Exception as e:
            print(f"Error loading city file: {e}")

    def _parse_city_data(self, cities):
        """Recursively add ``name -> code`` for each entry and its ``sublist``.

        Entries that are not dicts or lack ``name``/``code`` are skipped, so
        one malformed record no longer aborts the rest of the load.
        """
        for city in cities or []:
            if not isinstance(city, dict):
                continue
            if 'name' in city and 'code' in city:
                self.city_map[city['name']] = city['code']
            children = city.get('sublist')
            if children:
                self._parse_city_data(children)

    def get_code(self, city_name):
        """Return the code stored for ``city_name``, or ``None`` if unknown."""
        return self.city_map.get(city_name)
|
||||
|
||||
|
||||
def get_companies_from_page(page: Page) -> List[Dict[str, str]]:
    """Collect company names and links from a search-result page.

    Several CSS selectors are tried in order and the first one that matches
    any element wins; if none match, every ``a[href*="company"]`` link is
    used. Results are de-duplicated by company name, keeping the first hit.

    Args:
        page: The already-loaded search-result page.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts with absolute URLs.
    """
    company_selectors = [
        'a[class*="company"]',
        '.company-name a',
        'a.company-name',
        '[class*="CompanyName"] a',
        'a[href*="/company/"]'
    ]

    company_elements = []
    for selector in company_selectors:
        try:
            elements = page.query_selector_all(selector)
        except Exception:
            # A selector the page rejects is simply skipped.
            continue
        if elements:
            company_elements = elements
            print(f"使用选择器找到 {len(elements)} 个元素: {selector}")
            break

    # Nothing matched: fall back to any link whose href mentions "company".
    if not company_elements:
        all_links = page.query_selector_all('a[href*="company"]')
        company_elements = all_links
        print(f"通过通用方法找到 {len(all_links)} 个公司链接")

    companies = []
    seen_names = set()
    for element in company_elements:
        try:
            company_name = element.inner_text().strip()
            company_url = element.get_attribute('href')
        except Exception:
            # Skip elements that cannot be read.
            continue

        if not company_name or not company_url:
            continue

        if company_url.startswith('//'):
            # Protocol-relative link: it already carries its own host, so only
            # the scheme is missing. (Previously the site root was prepended,
            # producing "https://www.zhaopin.com//host/...".)
            company_url = f"https:{company_url}"
        elif company_url.startswith('/'):
            company_url = f"https://www.zhaopin.com{company_url}"
        elif not company_url.startswith('http'):
            company_url = f"https://www.zhaopin.com/{company_url}"

        # De-duplicate by name; the first occurrence is kept.
        if company_name in seen_names:
            continue
        seen_names.add(company_name)
        companies.append({
            'name': company_name,
            'url': company_url
        })

    return companies
|
||||
|
||||
|
||||
def get_company_intro(context: BrowserContext, company_url: str) -> str:
    """Open a company page in a new tab and return its introduction text.

    A list of CSS selectors is tried in order and the first non-empty text
    wins. If none yields text, the first 500 characters of ``<body>`` are
    used instead. The tab is always closed before returning.

    Args:
        context: Browser context used to open the temporary page.
        company_url: Absolute URL of the company page.

    Returns:
        The introduction text, ``"未找到公司简介"`` when nothing was found, or
        ``"获取失败: <error>"`` when loading/scraping raised.
    """
    company_page = None
    try:
        company_page = context.new_page()
        company_page.goto(company_url, wait_until="networkidle", timeout=30000)
        # Fixed extra wait after network idle, kept from the original.
        time.sleep(2)

        intro_selectors = [
            '.company-intro',
            '.company-description',
            '[class*="intro"]',
            '[class*="description"]',
            '.company-info',
            '[class*="CompanyIntro"]'
        ]

        company_intro = ""
        for selector in intro_selectors:
            try:
                intro_element = company_page.query_selector(selector)
                if intro_element:
                    company_intro = intro_element.inner_text().strip()
                    if company_intro:
                        break
            except Exception:
                # A failing selector is skipped; the next one is tried.
                continue

        # Last resort: take the beginning of the whole page text.
        if not company_intro:
            try:
                body = company_page.query_selector('body')
                if body:
                    company_intro = body.inner_text()[:500]
            except Exception:
                # Best effort only; fall through to the "not found" result.
                pass

        return company_intro if company_intro else "未找到公司简介"

    except Exception as e:
        return f"获取失败: {str(e)}"
    finally:
        # Close the tab on every path so a failed navigation does not leak it.
        if company_page is not None:
            try:
                company_page.close()
            except Exception:
                # The page may already be gone; nothing useful to report.
                pass
|
||||
|
||||
|
||||
def _pick_target_company(companies: List[Dict], target: str) -> Optional[Dict]:
    """Return the first exact name match, else the first substring match, else None."""
    for company in companies:
        company_name = company['name'].strip()
        if company_name == target:
            print(f"找到完全匹配的公司: {company_name}")
            return company

    for company in companies:
        company_name = company['name'].strip()
        if target in company_name or company_name in target:
            print(f"找到部分匹配的公司: {company_name}")
            return company

    return None


def crawl_companies(params: Dict, max_companies: int = 10, headless: bool = False, proxy: Optional[str] = None) -> List[Dict]:
    """Crawl company information from a Zhaopin search-result page.

    Args:
        params: Search parameters, e.g. ``{'jl': 530, 'kw': 'app推广经理'}`` or
            ``{'city': '北京', 'kw': '...'}``. A ``city`` name is translated to
            a ``jl`` code via ``CityLoader`` when ``jl`` is absent. The
            caller's dict is not modified.
        max_companies: Maximum number of companies to fetch when no company
            matches the keyword. Defaults to 10.
        headless: Whether to launch the browser headless. Defaults to False.
        proxy: Proxy address passed as the browser's proxy ``server``.

    Returns:
        A list of dicts with ``name``, ``url`` and ``intro``. When a company
        name matches the ``kw`` keyword (exactly, or failing that as a
        substring either way), only that company is returned.
    """
    # Work on a copy: the city lookup below writes 'jl' into the dict.
    params = dict(params)

    if 'city' in params and 'jl' not in params:
        code = CityLoader().get_code(params['city'])
        if code:
            print(f"城市 '{params['city']}' 映射代码为: {code}")
            params['jl'] = code
        else:
            print(f"未找到城市 '{params['city']}' 的代码")
    # NOTE(review): 'city' stays in params, so generate_url will emit it as a
    # query parameter — confirm that is intended.

    # Tolerate kw=None as well as a missing key.
    target_company = (params.get('kw') or '').strip()

    with sync_playwright() as p:
        browser_kwargs = {
            "headless": headless,
            "args": ["--disable-blink-features=AutomationControlled"]
        }

        # Use a locally installed Chrome when present, else the bundled browser.
        chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
        if os.path.exists(chrome_path):
            browser_kwargs["executable_path"] = chrome_path

        if proxy:
            # NOTE(review): credentials embedded in the proxy URL may need to be
            # passed as separate username/password fields — verify.
            browser_kwargs["proxy"] = {"server": proxy}
            print(f"使用代理: {proxy}")

        browser = p.chromium.launch(**browser_kwargs)
        try:
            context = browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            try:
                page = context.new_page()

                url = f"https://www.zhaopin.com/sou/{generate_url(params)}"
                print(f"访问URL: {url}")
                page.goto(url, wait_until="networkidle", timeout=30000)
                time.sleep(3)

                companies = get_companies_from_page(page)
                print(f"\n找到 {len(companies)} 家公司")

                # With a keyword, a matching company short-circuits the crawl.
                if target_company:
                    print(f"搜索目标公司: {target_company}")
                    match = _pick_target_company(companies, target_company)
                    if match is not None:
                        print(f"正在获取公司简介...")
                        company_intro = get_company_intro(context, match['url'])
                        return [{
                            'name': match['name'].strip(),
                            'url': match['url'],
                            'intro': company_intro
                        }]

                # No match: fetch the first max_companies results instead.
                print(f"未找到完全匹配的公司,获取前 {max_companies} 家公司信息")
                results = []
                shown_total = min(max_companies, len(companies))
                for i, company in enumerate(companies[:max_companies], 1):
                    print(f"\n[{i}/{shown_total}] 正在获取: {company['name']}")
                    company_intro = get_company_intro(context, company['url'])

                    results.append({
                        'name': company['name'],
                        'url': company['url'],
                        'intro': company_intro
                    })

                    # Pause between company pages.
                    time.sleep(1)

                return results
            finally:
                # Runs on success and on error, so the context is never leaked.
                context.close()
        finally:
            browser.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual demo: crawl one keyword in one city and print what came back.
    demo_params = {'city': '北京', 'kw': 'app推广经理'}
    crawled = crawl_companies(demo_params, max_companies=10)

    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print("\n" + heavy_rule)
    print("爬取结果:")
    print(heavy_rule)
    for item in crawled:
        intro = item['intro']
        # Long introductions are cut to 200 characters and marked with "...".
        shown = f"{intro[:200]}..." if len(intro) > 200 else intro
        print(f"\n公司名称: {item['name']}")
        print(f"公司链接: {item['url']}")
        print(f"公司简介: {shown}")
        print(light_rule)
|
||||
593
docs/爬虫数据上报接口文档.md
Normal file
593
docs/爬虫数据上报接口文档.md
Normal file
@ -0,0 +1,593 @@
|
||||
# 爬虫数据上报接口文档
|
||||
|
||||
> 适用版本:JobData v1.x
|
||||
> 更新日期:2026-03-20
|
||||
> 目标读者:接手爬虫开发或后端对接的工程师
|
||||
|
||||
---
|
||||
|
||||
## 目录
|
||||
|
||||
1. [整体架构](#1-整体架构)
|
||||
2. [认证方式](#2-认证方式)
|
||||
3. [核心上报接口](#3-核心上报接口)
|
||||
- 3.1 [异步批量上报(推荐)](#31-异步批量上报推荐)
|
||||
- 3.2 [同步批量上报](#32-同步批量上报)
|
||||
- 3.3 [同步单条上报](#33-同步单条上报)
|
||||
- 3.4 [平台专属便捷接口](#34-平台专属便捷接口)
|
||||
4. [各平台数据结构](#4-各平台数据结构)
|
||||
- 4.1 [BOSS直聘](#41-boss直聘-platformboss)
|
||||
- 4.2 [前程无忧](#42-前程无忧-platformqcwy)
|
||||
- 4.3 [智联招聘](#43-智联招聘-platformzhilian)
|
||||
5. [去重规则](#5-去重规则)
|
||||
6. [爬虫调用示例](#6-爬虫调用示例)
|
||||
7. [辅助接口](#7-辅助接口)
|
||||
8. [数据存储说明](#8-数据存储说明)
|
||||
9. [常见问题](#9-常见问题)
|
||||
|
||||
---
|
||||
|
||||
## 1. 整体架构
|
||||
|
||||
```
|
||||
爬虫(Boss / 前程无忧 / 智联)
|
||||
│
|
||||
│ POST /api/v1/universal/data/batch-store-async
|
||||
▼
|
||||
FastAPI 后端(app/)
|
||||
│
|
||||
├── 去重检查(ClickHouse 查最近 90 天)
|
||||
├── 写入 ClickHouse(job_data 库)
|
||||
└── 转发至外部数据平台(qixin.com)
|
||||
```
|
||||
|
||||
三个平台的爬虫**调用同一套接口**,通过 `platform` 字段区分来源,通过 `data_type` 字段区分数据类型(职位/公司)。
|
||||
|
||||
---
|
||||
|
||||
## 2. 认证方式
|
||||
|
||||
数据上报接口属于**内部接口,无需鉴权**。
|
||||
|
||||
爬虫调用时建议统一在 Header 中携带如下开发 Token(后端当前不校验该值;第 6 节的前程无忧、智联招聘示例未携带此 Header):
|
||||
|
||||
```
|
||||
token: dev
|
||||
```
|
||||
|
||||
> 说明:`dev` 是开发模式 Token,后端不验证签名,直接放行。生产部署如需启用鉴权,改用 JWT Token(HS256,有效期 7 天)。
|
||||
|
||||
---
|
||||
|
||||
## 3. 核心上报接口
|
||||
|
||||
**Base URL**(本地开发):`http://localhost:8000`
|
||||
|
||||
两个路径前缀完全等价,行为相同:
|
||||
- `/api/v1/universal`
|
||||
- `/api/v1/job`
|
||||
|
||||
---
|
||||
|
||||
### 3.1 异步批量上报(推荐)
|
||||
|
||||
**三个平台爬虫均使用此接口**,立即返回 202,后台异步写入。
|
||||
|
||||
```
|
||||
POST /api/v1/universal/data/batch-store-async
|
||||
```
|
||||
|
||||
**Request Headers**
|
||||
|
||||
```
|
||||
Content-Type: application/json
|
||||
token: dev
|
||||
```
|
||||
|
||||
**Request Body**
|
||||
|
||||
```json
|
||||
{
|
||||
"data_list": [
|
||||
{ ...原始职位或公司 JSON... }
|
||||
],
|
||||
"data_type": "job",
|
||||
"platform": "boss",
|
||||
"check_duplicate": true
|
||||
}
|
||||
```
|
||||
|
||||
| 字段 | 类型 | 必填 | 说明 |
|
||||
|------|------|:----:|------|
|
||||
| `data_list` | `List[Dict]` | ✅ | 原始数据列表,结构见第 4 节 |
|
||||
| `data_type` | `string` | ✅ | `job`(职位)或 `company`(公司) |
|
||||
| `platform` | `string` | ✅ | `boss` / `qcwy` / `zhilian` |
|
||||
| `check_duplicate` | `bool` | ❌ | 默认 `true`,`false` 时跳过去重直接写入 |
|
||||
|
||||
**Response(HTTP 202)**
|
||||
|
||||
```json
|
||||
{
|
||||
"code": 202,
|
||||
"message": "批量数据已加入异步处理队列,共 10 条",
|
||||
"platform": "boss",
|
||||
"data_type": "job"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3.2 同步批量上报
|
||||
|
||||
同步等待全部写入完成后返回,可获得详细的成功/失败统计。
|
||||
|
||||
```
|
||||
POST /api/v1/universal/data/batch-store
|
||||
```
|
||||
|
||||
**Request Body** — 与 3.1 相同。
|
||||
|
||||
**Response(HTTP 200)**
|
||||
|
||||
```json
|
||||
{
|
||||
"code": 200,
|
||||
"message": "批量处理完成: 成功 8 条,失败 0 条,重复 2 条",
|
||||
"data": {
|
||||
"total": 10,
|
||||
"success": 8,
|
||||
"failed": 0,
|
||||
"duplicate": 2,
|
||||
"errors": []
|
||||
},
|
||||
"platform": "boss",
|
||||
"data_type": "job"
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 3.3 同步单条上报
|
||||
|
||||
```
|
||||
POST /api/v1/universal/data/store
|
||||
```
|
||||
|
||||
**Request Body**
|
||||
|
||||
```json
|
||||
{
|
||||
"data": { ...单条原始 JSON... },
|
||||
"data_type": "job",
|
||||
"platform": "boss",
|
||||
"check_duplicate": true
|
||||
}
|
||||
```
|
||||
|
||||
注意:字段名是 `data`(单条),不是 `data_list`。
|
||||
|
||||
**Response(HTTP 200)**
|
||||
|
||||
```json
|
||||
{
|
||||
"code": 200,
|
||||
"message": "JSON数据存储成功",
|
||||
"data": {
|
||||
"success": true,
|
||||
"message": "JSON数据存储成功",
|
||||
"duplicate": false,
|
||||
"table_name": "boss_job",
|
||||
"storage_type": "json"
|
||||
},
|
||||
"platform": "boss",
|
||||
"data_type": "job"
|
||||
}
|
||||
```
|
||||
|
||||
数据重复时 `duplicate` 为 `true`,`message` 为 `"数据重复,跳过插入"`,HTTP 仍返回 200。
|
||||
|
||||
---
|
||||
|
||||
### 3.4 平台专属便捷接口
|
||||
|
||||
Request Body 直接传原始 JSON 对象(无需包装 `platform`/`data_type`),等价于 3.3 的 `data` 字段:
|
||||
|
||||
| URL | 平台 | 类型 |
|
||||
|-----|------|------|
|
||||
| `POST /api/v1/job/boss/job` | BOSS直聘 | 职位 |
|
||||
| `POST /api/v1/job/boss/company` | BOSS直聘 | 公司 |
|
||||
| `POST /api/v1/job/qcwy/job` | 前程无忧 | 职位 |
|
||||
| `POST /api/v1/job/qcwy/company` | 前程无忧 | 公司 |
|
||||
| `POST /api/v1/job/zhilian/job` | 智联招聘 | 职位 |
|
||||
| `POST /api/v1/job/zhilian/company` | 智联招聘 | 公司 |
|
||||
|
||||
---
|
||||
|
||||
## 4. 各平台数据结构
|
||||
|
||||
> 以下为 `data_list` 中每个元素的结构,即各平台原始 API 响应体(直接透传,无需转换)。
|
||||
|
||||
---
|
||||
|
||||
### 4.1 BOSS直聘(platform=boss)
|
||||
|
||||
#### 职位(data_type=job)
|
||||
|
||||
数据来源:BOSS 微信小程序接口 `/wapi/zpgeek/miniapp/job/detail.json`
|
||||
|
||||
```json
|
||||
{
|
||||
"jobBaseInfoVO": {
|
||||
"jobId": "123456",
|
||||
"encryptJobId": "abc123",
|
||||
"positionName": "Python 工程师",
|
||||
"locationName": "上海",
|
||||
"locationDesc": "上海市浦东新区XX路XX号",
|
||||
"jobDesc": "负责数据采集与处理...",
|
||||
"degreeName": "本科",
|
||||
"experienceName": "3-5年",
|
||||
"lowSalary": 15,
|
||||
"highSalary": 25,
|
||||
"requiredSkills": ["Python", "爬虫", "ClickHouse"],
|
||||
"salaryWelfareInfo": ["五险一金", "弹性工作"]
|
||||
},
|
||||
"brandComInfoVO": {
|
||||
"encryptBrandId": "brand_abc",
|
||||
"brandName": "某科技有限公司",
|
||||
"industryName": "互联网",
|
||||
"scaleName": "100-499人",
|
||||
"introduce": "公司简介..."
|
||||
},
|
||||
"bossBaseInfoVO": {
|
||||
"brandName": "张HR"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**关键去重字段**:`jobBaseInfoVO.jobId`
|
||||
|
||||
#### 公司(data_type=company)
|
||||
|
||||
数据来源:BOSS 微信小程序接口 `/wapi/zpgeek/miniapp/brand/detail.json`
|
||||
|
||||
```json
|
||||
{
|
||||
"name": "某科技有限公司",
|
||||
"companyFullInfoVO": {
|
||||
"name": "某科技有限公司(全称)"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
**关键去重字段**:`name` 或 `companyFullInfoVO.name`(提取后写入 `boss_company` 表的 `company_name` 列,用于去重查询)
|
||||
|
||||
---
|
||||
|
||||
### 4.2 前程无忧(platform=qcwy)
|
||||
|
||||
#### 职位(data_type=job)
|
||||
|
||||
数据来源:前程无忧 APP 接口
|
||||
|
||||
```json
|
||||
{
|
||||
"jobId": "JL123456789",
|
||||
"updateDateTime": "2026-03-20 10:00:00",
|
||||
"jobName": "数据工程师",
|
||||
"companyName": "某公司",
|
||||
"fullCompanyName": "某公司全称有限公司",
|
||||
"coId": "CO123456",
|
||||
"jobDescribe": "岗位职责:...",
|
||||
"degreeString": "本科",
|
||||
"workYearString": "3-5年",
|
||||
"jobSalaryMax": 20000,
|
||||
"jobSalaryMin": 15000,
|
||||
"provideSalaryString": "15k-20k",
|
||||
"termStr": "全职",
|
||||
"companySizeString": "500-999人",
|
||||
"companyTypeString": "民营企业",
|
||||
"major1Str": "互联网/电子商务",
|
||||
"major2Str": "数据服务",
|
||||
"jobWelfareCodeDataList": [
|
||||
{ "chineseTitle": "五险一金", "typeTitle": "社保", "code": "001" }
|
||||
],
|
||||
"jobTagsForOrder": ["Python", "Spark", "Hive"],
|
||||
"location": "上海",
|
||||
"workLocation": {
|
||||
"workAddress": "浦东新区",
|
||||
"address": "上海市浦东新区XX路"
|
||||
},
|
||||
"jobAreaString": "上海",
|
||||
"jobAreaLevelDetail": {
|
||||
"cityString": "上海",
|
||||
"landMarkString": "陆家嘴"
|
||||
},
|
||||
"confirmDateString": "2026-03-20",
|
||||
"jobHref": "https://www.51job.com/...",
|
||||
"companyHref": "https://www.51job.com/..."
|
||||
}
|
||||
```
|
||||
|
||||
**关键去重字段**:`jobId` + `updateDateTime`(两字段联合唯一)
|
||||
|
||||
#### 公司(data_type=company)
|
||||
|
||||
```json
|
||||
{
|
||||
"companyName": "某公司",
|
||||
"fullCompanyName": "某公司全称有限公司"
|
||||
}
|
||||
```
|
||||
|
||||
**关键去重字段**:`companyName`
|
||||
|
||||
---
|
||||
|
||||
### 4.3 智联招聘(platform=zhilian)
|
||||
|
||||
#### 职位(data_type=job)
|
||||
|
||||
数据来源:智联招聘 PC 搜索接口 `https://fe-api.zhaopin.com/c/i/search/positions`
|
||||
|
||||
```json
|
||||
{
|
||||
"number": "ZL20260320001",
|
||||
"firstPublishTime": "2026-03-20T10:00:00",
|
||||
"name": "后端开发工程师",
|
||||
"jobId": "J001",
|
||||
"companyName": "某公司",
|
||||
"companyId": "C001",
|
||||
"salary60": "15k-25k",
|
||||
"jobSummary": "职位描述:负责后端服务开发...",
|
||||
"education": "本科",
|
||||
"workingExp": "3-5年",
|
||||
"workType": "全职",
|
||||
"workCity": "上海",
|
||||
"cityDistrict": "浦东新区",
|
||||
"companySize": "500-999人",
|
||||
"propertyName": "民营企业",
|
||||
"industryName": "互联网",
|
||||
"skillLabel": [
|
||||
{ "value": "Go" },
|
||||
{ "value": "Python" }
|
||||
],
|
||||
"recruitNumber": 3,
|
||||
"positionURL": "https://www.zhaopin.com/...",
|
||||
"companyUrl": "https://www.zhaopin.com/...",
|
||||
"companyDesc": "公司描述(从额外接口补充)"
|
||||
}
|
||||
```
|
||||
|
||||
**关键去重字段**:`number` + `firstPublishTime`(两字段联合唯一)
|
||||
|
||||
#### 公司(data_type=company)
|
||||
|
||||
```json
|
||||
{
|
||||
"companyName": "某公司",
|
||||
"name": "某公司"
|
||||
}
|
||||
```
|
||||
|
||||
**关键去重字段**:`companyName` 或 `name`
|
||||
|
||||
---
|
||||
|
||||
## 5. 去重规则
|
||||
|
||||
| 平台 | 数据类型 | 去重字段 | ClickHouse 表 |
|
||||
|------|----------|----------|---------------|
|
||||
| boss | job | `jobBaseInfoVO.jobId` | `boss_job` |
|
||||
| boss | company | `name` / `companyFullInfoVO.name` | `boss_company` |
|
||||
| qcwy | job | `jobId` + `updateDateTime` | `qcwy_job` |
|
||||
| qcwy | company | `companyName` | `qcwy_company` |
|
||||
| zhilian | job | `number` + `firstPublishTime` | `zhilian_job` |
|
||||
| zhilian | company | `companyName` / `name` | `zhilian_company` |
|
||||
|
||||
- 去重检查范围:**最近 90 天**内已入库的记录。
|
||||
- 重复数据不报错,正常返回 200,`duplicate: true`。
|
||||
- 传 `check_duplicate: false` 可跳过去重,强制写入(测试时使用)。
|
||||
|
||||
---
|
||||
|
||||
## 6. 爬虫调用示例
|
||||
|
||||
### BOSS直聘(`jobs_spider/boss/boos_api.py`)
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
API_BASE_URL = "http://localhost:8000"
|
||||
|
||||
def push_job(zp_data: dict):
|
||||
"""推送职位数据"""
|
||||
payload = {
|
||||
"data_list": [zp_data],
|
||||
"data_type": "job",
|
||||
"platform": "boss"
|
||||
}
|
||||
resp = requests.post(
|
||||
f"{API_BASE_URL}/api/v1/universal/data/batch-store-async",
|
||||
headers={
|
||||
"accept": "application/json",
|
||||
"token": "dev",
|
||||
"Content-Type": "application/json"
|
||||
},
|
||||
json=payload,
|
||||
timeout=30
|
||||
)
|
||||
return resp.json()
|
||||
|
||||
|
||||
def push_company(zp_data: dict):
|
||||
"""推送公司数据"""
|
||||
payload = {
|
||||
"data_list": [zp_data],
|
||||
"data_type": "company",
|
||||
"platform": "boss"
|
||||
}
|
||||
resp = requests.post(
|
||||
f"{API_BASE_URL}/api/v1/universal/data/batch-store-async",
|
||||
headers={
|
||||
"accept": "application/json",
|
||||
"token": "dev",
|
||||
"Content-Type": "application/json"
|
||||
},
|
||||
json=payload,
|
||||
timeout=30
|
||||
)
|
||||
return resp.json()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 前程无忧(`jobs_spider/qcwy/qcwy.py`)
|
||||
|
||||
```python
|
||||
import requests
|
||||
import socket
|
||||
|
||||
API_BASE_URL = "http://localhost:8000"
|
||||
local_ip = socket.gethostbyname(socket.gethostname())
|
||||
|
||||
def report_data(data: list, data_type: str = "job"):
|
||||
"""批量上报数据"""
|
||||
payload = {
|
||||
"data_list": data,
|
||||
"data_type": data_type, # "job" 或 "company"
|
||||
"platform": "qcwy"
|
||||
}
|
||||
resp = requests.post(
|
||||
f"{API_BASE_URL}/api/v1/universal/data/batch-store-async",
|
||||
json=payload,
|
||||
headers={
|
||||
"accept": "application/json",
|
||||
"Content-Type": "application/json",
|
||||
"X-Forwarded-For": local_ip # 传递真实 IP,用于日志溯源
|
||||
},
|
||||
timeout=300
|
||||
)
|
||||
return resp.json()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### 智联招聘(`jobs_spider/zhilian/zhilian_single.py`)
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
API_BASE_URL = "http://localhost:8000"
|
||||
|
||||
def report_data(data_list: list, data_type: str = "job"):
|
||||
"""批量上报数据"""
|
||||
payload = {
|
||||
"data_list": data_list,
|
||||
"data_type": data_type, # "job" 或 "company"
|
||||
"platform": "zhilian"
|
||||
}
|
||||
resp = requests.post(
|
||||
f"{API_BASE_URL}/api/v1/universal/data/batch-store-async",
|
||||
json=payload,
|
||||
headers={
|
||||
"accept": "application/json",
|
||||
"Content-Type": "application/json"
|
||||
},
|
||||
timeout=300
|
||||
)
|
||||
return resp.json()
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 7. 辅助接口
|
||||
|
||||
爬虫运行过程中还会调用以下辅助接口:
|
||||
|
||||
| 接口 | 说明 | 主要使用方 |
|
||||
|------|------|-----------|
|
||||
| `GET /api/v1/token/tokens?page=1&page_size=10` | 获取可用的 MPT Token 列表 | BOSS爬虫 |
|
||||
| `GET /api/v1/keyword/available?source=boss&limit=1&reserve=True` | 获取下一个未使用的关键词(城市+职位组合) | BOSS爬虫 |
|
||||
| `POST /api/v1/keyword/mark-used` | 标记关键词已使用 | BOSS爬虫 |
|
||||
| `GET /api/v1/stats` | 查询各平台已入库数据量 | 监控/运营 |
|
||||
| `GET /api/v1/platforms` | 查询支持的平台列表及去重字段配置 | 调试 |
|
||||
| `GET /api/v1/universal/data?platform=boss&data_type=job&page=1&page_size=20` | 分页查询已入库数据 | 调试 |
|
||||
|
||||
### 标记关键词已使用 Request Body
|
||||
|
||||
```json
|
||||
{
|
||||
"source": "boss",
|
||||
"ids": [1, 2, 3]
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 8. 数据存储说明
|
||||
|
||||
### ClickHouse 表结构
|
||||
|
||||
所有表均在 `job_data` 数据库下,ENGINE = `MergeTree()`。
|
||||
|
||||
**通用列(每张表都有):**
|
||||
|
||||
| 列名 | 类型 | 说明 |
|
||||
|------|------|------|
|
||||
| `id` | UInt64 | 自增 ID |
|
||||
| `json_data` | String | 原始 JSON 字符串(完整保存) |
|
||||
| `created_at` | DateTime | 入库时间 |
|
||||
| `updated_at` | DateTime | 更新时间 |
|
||||
|
||||
**各表额外列(用于去重查询):**
|
||||
|
||||
| 表名 | 额外列 |
|
||||
|------|--------|
|
||||
| `boss_job` | `job_id String` |
|
||||
| `boss_company` | `company_name String` |
|
||||
| `qcwy_job` | `job_id String`, `update_date_time String` |
|
||||
| `qcwy_company` | `company_name String` |
|
||||
| `zhilian_job` | `number String`, `first_publish_time String` |
|
||||
| `zhilian_company` | `company_name String` |
|
||||
|
||||
### 统一查询视图
|
||||
|
||||
`job_analytics` 视图 UNION ALL 三张职位表,提供统一查询入口:
|
||||
|
||||
| 列名 | 说明 |
|
||||
|------|------|
|
||||
| `source` | 平台来源(boss/qcwy/zhilian) |
|
||||
| `job_id` | 职位唯一标识 |
|
||||
| `position_name` | 职位名称 |
|
||||
| `company_name` | 公司名称 |
|
||||
| `salary_text` | 薪资描述 |
|
||||
| `city` | 城市 |
|
||||
| `experience_required` | 经验要求 |
|
||||
| `education` | 学历要求 |
|
||||
| `created_at` | 入库时间 |
|
||||
|
||||
---
|
||||
|
||||
## 9. 常见问题
|
||||
|
||||
**Q:上报返回 202,但数据库里查不到数据?**
|
||||
A:异步接口的写入有延迟(通常 1-5 秒)。改用同步接口 `batch-store` 可立即确认写入结果。
|
||||
|
||||
**Q:如何判断某条数据是否已存在?**
|
||||
A:调用同步单条上报接口,响应中 `duplicate: true` 表示已存在。
|
||||
|
||||
**Q:`check_duplicate: false` 会导致重复数据吗?**
|
||||
A:会。仅在测试/调试时使用,生产环境保持默认 `true`。
|
||||
|
||||
**Q:三个平台的数据结构差异大,如何统一分析?**
|
||||
A:使用 `job_analytics` 视图,已将三张表的字段映射为统一列名。
|
||||
|
||||
**Q:爬虫报超时错误怎么处理?**
|
||||
A:异步接口 timeout 建议设 30s,同步接口因要等待写入完成,建议设 300s。若仍超时,检查 ClickHouse 连接状态。
|
||||
|
||||
**Q:`token: dev` 在生产环境安全吗?**
|
||||
A:不安全。生产环境应替换为 JWT Token,并在接口上挂载鉴权中间件。
|
||||
|
||||
---
|
||||
|
||||
*文档由 JobData 项目自动生成,如有疑问,请联系项目维护者。*
|
||||
38
ecs_full_pipeline.log
Normal file
38
ecs_full_pipeline.log
Normal file
@ -0,0 +1,38 @@
|
||||
[main] start clearing instances with prefix launch-advisor-20251123
|
||||
当前地域无实例或无匹配实例,无需清理
|
||||
[main] clearing completed
|
||||
[创建] 正在提交创建实例请求
|
||||
InvalidAccountStatus.NotEnoughBalance
|
||||
code: 403, Your account does not have enough balance to order postpaid product. request id: 09E71CDD-F721-589B-BF1C-15B63EAD78EC
|
||||
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=09E71CDD-F721-589B-BF1C-15B63EAD78EC
|
||||
未获得实例ID,终止
|
||||
|
||||
[定时] 开始执行 pipeline:2026-03-21T12:30:00.005008
|
||||
[main] start clearing instances with prefix launch-advisor-20251123
|
||||
当前地域无实例或无匹配实例,无需清理
|
||||
[main] clearing completed
|
||||
[创建] 正在提交创建实例请求
|
||||
InvalidAccountStatus.NotEnoughBalance
|
||||
code: 403, Your account does not have enough balance to order postpaid product. request id: A14B7A5D-A924-586D-AA89-4D113D5DA2C7
|
||||
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=A14B7A5D-A924-586D-AA89-4D113D5DA2C7
|
||||
未获得实例ID,终止
|
||||
[main] start clearing instances with prefix launch-advisor-20251123
|
||||
当前地域无实例或无匹配实例,无需清理
|
||||
[main] clearing completed
|
||||
[创建] 正在提交创建实例请求
|
||||
InvalidAccountStatus.NotEnoughBalance
|
||||
code: 403, Your account does not have enough balance to order postpaid product. request id: 577C72C7-7099-517D-A96E-1EE59220AAB3
|
||||
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=577C72C7-7099-517D-A96E-1EE59220AAB3
|
||||
未获得实例ID,终止
|
||||
|
||||
[定时] 开始执行 pipeline:2026-03-22T12:30:00.015339
|
||||
[main] start clearing instances with prefix launch-advisor-20251123
|
||||
当前地域无实例或无匹配实例,无需清理
|
||||
[main] clearing completed
|
||||
[创建] 正在提交创建实例请求
|
||||
InvalidAccountStatus.NotEnoughBalance
|
||||
code: 403, Your account does not have enough balance to order postpaid product. request id: 49E530D3-304A-57CE-9FD4-DCE0F2824B20
|
||||
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=49E530D3-304A-57CE-9FD4-DCE0F2824B20
|
||||
未获得实例ID,终止
|
||||
|
||||
[定时] 开始执行 pipeline:2026-03-22T18:30:00.007107
|
||||
@ -7,7 +7,9 @@ from loguru import logger
|
||||
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from app.services.crawler.qcwy import QcwyService
|
||||
from app.services.job import DataRouterService, DataType, PlatformType
|
||||
from app.services.ingest import IngestService
|
||||
from app.services.ingest.remote_push import push_to_remote
|
||||
from app.services.ingest.configs.qcwy import _build_qcwy_push
|
||||
from app.settings.config import settings
|
||||
|
||||
# 提取 jobId 的正则表达式
|
||||
@ -16,12 +18,12 @@ JOB_ID_REGEX = re.compile(r'/(\d+)\.html')
|
||||
class LinkRecleaner:
|
||||
def __init__(self):
|
||||
self.qcwy_service = QcwyService()
|
||||
self.data_router = None
|
||||
self.ingest_service = None
|
||||
self.semaphore = asyncio.Semaphore(50) # 限制并发
|
||||
|
||||
async def init(self):
|
||||
ch_client = await clickhouse_manager.get_client()
|
||||
self.data_router = DataRouterService(ch_client)
|
||||
self.ingest_service = IngestService(ch_client)
|
||||
|
||||
async def get_job_id_from_url(self, url: str) -> Optional[str]:
|
||||
match = JOB_ID_REGEX.search(url)
|
||||
@ -58,8 +60,8 @@ class LinkRecleaner:
|
||||
source = "Crawler"
|
||||
if data:
|
||||
# 存入数据库供下次使用
|
||||
await self.data_router.store_data(
|
||||
data, DataType.JOB, PlatformType.QCWY, check_duplicate=True
|
||||
await self.ingest_service.store_single(
|
||||
"qcwy", "mini", "job", data, check_duplicate=True
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Crawl failed for {job_id}: {e}")
|
||||
@ -84,14 +86,12 @@ class LinkRecleaner:
|
||||
|
||||
# 4. 准备推送数据
|
||||
try:
|
||||
remote_data = await self.data_router._prepare_remote_push_data(
|
||||
data, DataType.JOB, PlatformType.QCWY
|
||||
)
|
||||
|
||||
remote_data = _build_qcwy_push(data)
|
||||
|
||||
if remote_data:
|
||||
# 5. 发送到第三方
|
||||
success = await self.data_router.send_to_remote_server(remote_data)
|
||||
status = "✅ Success" if success else "❌ Failed"
|
||||
success = await push_to_remote(remote_data)
|
||||
status = "Success" if success else "Failed"
|
||||
logger.info(f"[{source}] Push {job_id}: {status}")
|
||||
return success
|
||||
else:
|
||||
|
||||
2
run.py
2
run.py
@ -13,5 +13,5 @@ if __name__ == "__main__":
|
||||
|
||||
host = os.getenv("APP_HOST", "0.0.0.0")
|
||||
port = int(os.getenv("APP_PORT", "9999"))
|
||||
workers = int(os.getenv("UVICORN_WORKERS", "20"))
|
||||
workers = int(os.getenv("UVICORN_WORKERS", "1"))
|
||||
uvicorn.run("app:app", host=host, port=port, workers=workers, log_config=LOGGING_CONFIG)
|
||||
|
||||
154
spiderJobs/core/base.py
Normal file
154
spiderJobs/core/base.py
Normal file
@ -0,0 +1,154 @@
|
||||
"""
|
||||
core.base - 通用基类与数据结构
|
||||
提供所有招聘平台共用的:ApiResult, BaseFetcher, BaseSearcher
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
from spiderJobs.core.http_client import HTTPClient
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# 通用数据结构
|
||||
# ─────────────────────────────────────────────
|
||||
|
||||
@dataclass
class ApiResult:
    """Uniform result envelope returned by every API call in this package."""

    # Required: overall outcome and the status code reported for the call.
    success: bool
    status_code: int

    # Optional payload; list-style responses additionally fill the
    # list / count / is_end_page trio.
    data: Any = None
    list: list[dict] = field(default_factory=list)
    count: int = 0
    is_end_page: bool = True

    # Error text for failed calls; None when nothing was reported.
    error: str | None = None
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# 通用响应解析(可覆写)
|
||||
# ─────────────────────────────────────────────
|
||||
|
||||
def parse_response(http_code: int, raw: Any) -> ApiResult:
    """
    Default response parser: turn an HTTP status plus decoded body into an ApiResult.

    The call counts as successful only when both the HTTP status and the
    business code (``raw["statusCode"]``) equal 200. A non-dict body carries
    no business code, so the HTTP status stands in for it.

    Platforms with a different response format can override the ``_parse``
    hook of their fetcher/searcher instead of using this function.

    Args:
        http_code: HTTP status code of the response.
        raw: Decoded response body, normally a dict.

    Returns:
        A failed ApiResult carrying the upstream error text, or a successful
        one whose ``data`` is ``raw["data"]`` (``{}`` when missing or falsy).
        When that payload has a ``list`` key, ``list`` / ``count`` /
        ``is_end_page`` are filled from it.
    """
    body = raw if isinstance(raw, dict) else None

    # NOTE(review): a dict body without "statusCode" yields biz_code=None and is
    # therefore reported as a failure even on HTTP 200 - confirm this is intended
    # for every platform that relies on the default parser.
    biz_code = body.get("statusCode") if body is not None else http_code

    if http_code != 200 or biz_code != 200:
        if body is not None:
            message = (
                body.get("statusDescription")
                or body.get("message")
                or f"请求失败: {biz_code}"
            )
        else:
            message = f"请求失败: {http_code}"
        return ApiResult(
            success=False,
            status_code=biz_code or http_code,
            error=message,
        )

    payload = (body.get("data") or {}) if body is not None else {}

    if isinstance(payload, dict) and "list" in payload:
        return ApiResult(
            success=True, status_code=200, data=payload,
            # An explicit null "list"/"count" must not leak None into the
            # result: callers extend/iterate .list and do arithmetic on .count.
            list=payload.get("list") or [],
            count=payload.get("count") or 0,
            is_end_page=payload.get("isEndPage", True),
        )

    return ApiResult(success=True, status_code=200, data=payload)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# 基础 Fetcher(GET 详情类)
|
||||
# ─────────────────────────────────────────────
|
||||
|
||||
class BaseFetcher:
    """
    Base class for single-object endpoints queried with a GET request.

    Subclasses must provide:
        ENDPOINT: path of the endpoint
        _build_params(): query parameters for the request
    Subclasses may override:
        _parse(): custom response parsing
    """
    ENDPOINT: str = ""

    def __init__(self, http_client: HTTPClient):
        self._http = http_client

    def _build_params(self) -> dict:
        """Return the query parameters; must be implemented by subclasses."""
        raise NotImplementedError

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Convert the raw response into an ApiResult (default parser)."""
        return parse_response(http_code, raw)

    def fetch(self) -> ApiResult:
        """Issue the GET request and return the parsed result.

        Any exception raised while building parameters or performing the
        request is converted into a failed ApiResult with status_code=-1.
        """
        try:
            send = self._http.get
            status, body = send(self.ENDPOINT, self._build_params())
        except Exception as exc:
            return ApiResult(success=False, status_code=-1, error=str(exc))
        return self._parse(status, body)
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────
|
||||
# 基础 Searcher(搜索 + 分页类)
|
||||
# ─────────────────────────────────────────────
|
||||
|
||||
class BaseSearcher:
    """
    Base class for list endpoints with pagination support.

    Subclasses must provide:
        ENDPOINT: path of the endpoint
        _build_params(page_index): request parameters for one page
    Subclasses may override:
        _request(params): POST by default, can be switched to GET
        _parse(): custom response parsing
    """
    ENDPOINT: str = ""

    def __init__(self, page_size: int = 15, http_client: HTTPClient = None):
        self.page_size = page_size
        self._http = http_client

    def _build_params(self, page_index: int) -> dict:
        """Return the request parameters for a page; implemented by subclasses."""
        raise NotImplementedError

    def _request(self, params: dict) -> tuple[int, Any]:
        """Send the request for one page (POST to ENDPOINT)."""
        return self._http.post(self.ENDPOINT, params)

    def _parse(self, http_code: int, raw: Any) -> ApiResult:
        """Convert the raw response into an ApiResult (default parser)."""
        return parse_response(http_code, raw)

    def search(self, page_index: int = 1) -> ApiResult:
        """Fetch a single page.

        Parameter building happens outside the guarded section, so errors
        from _build_params propagate; request errors become a failed
        ApiResult with status_code=-1.
        """
        request_params = self._build_params(page_index)
        try:
            status, body = self._request(request_params)
        except Exception as exc:
            return ApiResult(success=False, status_code=-1, error=str(exc))
        return self._parse(status, body)

    def load_all(
        self,
        max_pages: int = 10,
        on_page: Optional[Callable[[ApiResult, int], None]] = None,
    ) -> list[dict]:
        """Load pages 1..max_pages and return the accumulated items.

        Stops early on the first failed page or when a page reports
        is_end_page. on_page, when given, is called with each successful
        result and its page index.
        """
        collected: list[dict] = []
        page_index = 1
        while page_index <= max_pages:
            page = self.search(page_index=page_index)
            if not page.success:
                print(f"第 {page_index} 页失败: {page.error}")
                break
            collected += page.list
            if on_page:
                on_page(page, page_index)
            if page.is_end_page:
                break
            page_index += 1
        return collected
|
||||
155
spiderJobs/core/http_client.py
Normal file
155
spiderJobs/core/http_client.py
Normal file
@ -0,0 +1,155 @@
|
||||
"""
|
||||
core.http_client - 通用 HTTP 客户端
|
||||
基于 requests-go,自带 Chrome TLS 指纹伪装
|
||||
支持代理 IP / 隧道代理 / 代理池轮换
|
||||
与任何招聘平台无关,纯粹负责发请求
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
from typing import Any, Optional
|
||||
|
||||
import requests_go as requests
|
||||
from requests_go.tls_config import TLS_CHROME_LATEST
|
||||
|
||||
|
||||
class HTTPClient:
    """
    General-purpose HTTP client.

    Args:
        base_url: Base address of the API.
        default_headers: Headers sent with every request.
        proxy: Fixed proxy address (bound to the session).
        tunnel_proxy: Tunnel proxy address (a new session is created for
            every request).
        proxy_pool: List of proxies; one is picked at random per request.
        timeout: Request timeout in seconds (default 10).

    Proxy priority: tunnel_proxy > proxy_pool > proxy. Configure only one.

    Proxy format examples:
        Plain proxy:          "http://127.0.0.1:7890"
        SOCKS5 proxy:         "socks5://127.0.0.1:1080"
        Tunnel proxy:         "http://user:pass@tunnel.example.com:12345"
        Tunnel proxy (auth):  "http://account-zone-xxx:password@proxy.host:port"

    Tunnel proxy usage:
        client = HTTPClient(
            base_url="https://example.com",
            tunnel_proxy="http://user:pass@tunnel.example.com:12345",
        )
        # every get/post call uses a brand-new session
    """

    def __init__(
        self,
        base_url: str,
        default_headers: Optional[dict] = None,
        proxy: Optional[str] = None,
        tunnel_proxy: Optional[str] = None,
        proxy_pool: Optional[list[str]] = None,
        timeout: int = 10,
    ):
        self.base_url = base_url
        self.default_headers = default_headers or {}
        self.timeout = timeout

        # Proxy configuration.
        self._proxy = proxy
        self._tunnel_proxy = tunnel_proxy
        self._proxy_pool = proxy_pool

        # Long-lived session with the Chrome TLS configuration attached.
        self._session = requests.Session()
        self._session.tls_config = TLS_CHROME_LATEST
        TLS_CHROME_LATEST.random_ja3 = True

        # A fixed proxy is attached to the session only when neither of the
        # higher-priority proxy modes is configured.
        if proxy and not (proxy_pool or tunnel_proxy):
            self._session.proxies = {"http": proxy, "https": proxy}

    def _new_session(self) -> requests.Session:
        """Create a fresh session (used for tunnel-proxy requests)."""
        fresh = requests.Session()
        fresh.tls_config = TLS_CHROME_LATEST
        TLS_CHROME_LATEST.random_ja3 = True
        return fresh

    def _get_proxies(self) -> Optional[dict]:
        """Return the per-request proxy mapping, or None when none is needed."""
        if not self._proxy_pool:
            # A fixed proxy already lives on the session; nothing to pass.
            return None
        # Pool mode: pick one entry and append a random "#fragment" so the
        # proxy URL differs on every request.
        picked = random.choice(self._proxy_pool)
        tagged = f"{picked}#{random.randint(100000, 999999)}"
        return {"http": tagged, "https": tagged}

    def _merge_headers(self, extra: Optional[dict] = None) -> dict:
        """Return default headers overlaid with the per-request extras."""
        return {**self.default_headers, **(extra or {})}

    def _send_request(
        self, verb: str, path: str, headers: Optional[dict], **payload: Any
    ) -> tuple[int, Any]:
        """Shared implementation of get/post.

        verb is the session method name ("get" or "post"); payload carries
        the verb-specific keyword (params or json).
        """
        merged = self._merge_headers(headers)
        url = f"{self.base_url}{path}"

        # Tunnel mode: dedicated session per request, closed afterwards.
        if self._tunnel_proxy:
            session = self._new_session()
            try:
                resp = getattr(session, verb)(
                    url,
                    headers=merged,
                    proxies={"http": self._tunnel_proxy, "https": self._tunnel_proxy},
                    timeout=self.timeout,
                    **payload,
                )
                return resp.status_code, resp.json()
            finally:
                session.close()

        options: dict[str, Any] = {**payload, "headers": merged, "timeout": self.timeout}
        per_request_proxies = self._get_proxies()
        if per_request_proxies:
            options["proxies"] = per_request_proxies
        resp = getattr(self._session, verb)(url, **options)
        return resp.status_code, resp.json()

    def post(self, path: str, body: dict, headers: Optional[dict] = None) -> tuple[int, Any]:
        """Send a POST request with a JSON body; returns (status_code, decoded JSON)."""
        return self._send_request("post", path, headers, json=body)

    def get(self, path: str, params: Optional[dict] = None, headers: Optional[dict] = None) -> tuple[int, Any]:
        """Send a GET request; returns (status_code, decoded JSON)."""
        return self._send_request("get", path, headers, params=params)
|
||||
|
||||
|
||||
0
spiderJobs/platforms/__init__.py
Normal file
0
spiderJobs/platforms/__init__.py
Normal file
1
spiderJobs/platforms/boss/__init__.py
Normal file
1
spiderJobs/platforms/boss/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
|
||||
68
spiderJobs/platforms/boss/company_main.py
Normal file
68
spiderJobs/platforms/boss/company_main.py
Normal file
@ -0,0 +1,68 @@
|
||||
"""
|
||||
Boss直聘 独立公司爬虫入口
|
||||
|
||||
从 pending_company 队列获取待爬取的 Boss 公司,
|
||||
逐个调用 GetBrandDetail 获取详情并上传。
|
||||
|
||||
启动:
|
||||
python -m spiderJobs.platforms.boss.company_main
|
||||
|
||||
环境变量:
|
||||
API_BASE_URL 后端地址 (默认 http://127.0.0.1:9999)
|
||||
COMPANY_BATCH_SIZE 每批获取公司数 (默认 10)
|
||||
SLEEP_MIN_SECONDS 最小延迟秒数 (默认 10)
|
||||
SLEEP_MAX_SECONDS 最大延迟秒数 (默认 20)
|
||||
BOSS_MPT Boss Token (mpt)
|
||||
BOSS_WT2 Boss Token (wt2)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
_project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||
if _project_root not in sys.path:
|
||||
sys.path.insert(0, _project_root)
|
||||
|
||||
from spiderJobs.core.base import BaseFetcher
|
||||
from spiderJobs.platforms.boss.api import GetBrandDetail
|
||||
from spiderJobs.platforms.boss.client import BossClient, create_client
|
||||
from spiderJobs.platforms.boss.sign import BossSign
|
||||
from spiderJobs.runner.company_loop import run_company_loop
|
||||
|
||||
|
||||
def create_company_fetcher(company_id: str, http_client: BossClient) -> BaseFetcher:
    """Build a GetBrandDetail fetcher for the given company id on the shared client."""
    fetcher = GetBrandDetail(client=http_client, brand_id=company_id)
    return fetcher
|
||||
|
||||
|
||||
def main():
    """Assemble Boss client options from environment variables and start the company loop.

    Reads BOSS_MPT / BOSS_WT2 for the signer and PROXY_TUNNEL, PROXY_SCHEME,
    PROXY_USERNAME, PROXY_PASSWORD for the tunnel proxy.
    """
    env = os.environ
    options = {}

    # A signer is configured as soon as either token is present.
    boss_mpt = env.get("BOSS_MPT", "")
    boss_wt2 = env.get("BOSS_WT2", "")
    if boss_mpt or boss_wt2:
        options["signer"] = BossSign(mpt=boss_mpt, wt2=boss_wt2)

    tunnel_host = env.get("PROXY_TUNNEL", "")
    if tunnel_host:
        proxy_scheme = env.get("PROXY_SCHEME", "http")
        proxy_user = env.get("PROXY_USERNAME", "")
        proxy_pass = env.get("PROXY_PASSWORD", "")
        # Credentials are embedded only when both parts are provided.
        auth = f"{proxy_user}:{proxy_pass}@" if proxy_user and proxy_pass else ""
        options["tunnel_proxy"] = f"{proxy_scheme}://{auth}{tunnel_host}"

    run_company_loop(
        platform="boss",
        create_company_fetcher=create_company_fetcher,
        create_client_fn=create_client,
        client_kwargs=options,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
1
spiderJobs/platforms/job51/__init__.py
Normal file
1
spiderJobs/platforms/job51/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
|
||||
59
spiderJobs/platforms/job51/company_main.py
Normal file
59
spiderJobs/platforms/job51/company_main.py
Normal file
@ -0,0 +1,59 @@
|
||||
"""
|
||||
前程无忧 (51Job) 独立公司爬虫入口
|
||||
|
||||
从 pending_company 队列获取待爬取的 51job 公司,
|
||||
逐个调用 GetCompanyInfo 获取详情并上传。
|
||||
|
||||
启动:
|
||||
python -m spiderJobs.platforms.job51.company_main
|
||||
|
||||
环境变量:
|
||||
API_BASE_URL 后端地址 (默认 http://127.0.0.1:9999)
|
||||
COMPANY_BATCH_SIZE 每批获取公司数 (默认 10)
|
||||
SLEEP_MIN_SECONDS 最小延迟秒数 (默认 10)
|
||||
SLEEP_MAX_SECONDS 最大延迟秒数 (默认 20)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
_project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||
if _project_root not in sys.path:
|
||||
sys.path.insert(0, _project_root)
|
||||
|
||||
from spiderJobs.core.base import BaseFetcher
|
||||
from spiderJobs.platforms.job51.api import GetCompanyInfo
|
||||
from spiderJobs.platforms.job51.client import Job51Client, create_client
|
||||
from spiderJobs.runner.company_loop import run_company_loop
|
||||
|
||||
|
||||
def create_company_fetcher(company_id: str, http_client: Job51Client) -> BaseFetcher:
    """Build a GetCompanyInfo fetcher for the given company id on the shared client."""
    fetcher = GetCompanyInfo(client=http_client, company_id=company_id)
    return fetcher
|
||||
|
||||
|
||||
def main():
    """Assemble 51job client options from environment variables and start the company loop.

    Reads PROXY_TUNNEL, PROXY_SCHEME, PROXY_USERNAME and PROXY_PASSWORD for
    the tunnel proxy; the loop itself runs under the platform id "qcwy".
    """
    env = os.environ
    options = {}

    tunnel_host = env.get("PROXY_TUNNEL", "")
    if tunnel_host:
        proxy_scheme = env.get("PROXY_SCHEME", "http")
        proxy_user = env.get("PROXY_USERNAME", "")
        proxy_pass = env.get("PROXY_PASSWORD", "")
        # Credentials are embedded only when both parts are provided.
        auth = f"{proxy_user}:{proxy_pass}@" if proxy_user and proxy_pass else ""
        options["tunnel_proxy"] = f"{proxy_scheme}://{auth}{tunnel_host}"

    run_company_loop(
        platform="qcwy",
        create_company_fetcher=create_company_fetcher,
        create_client_fn=create_client,
        client_kwargs=options,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
0
spiderJobs/platforms/zhilian/__init__.py
Normal file
0
spiderJobs/platforms/zhilian/__init__.py
Normal file
53
spiderJobs/platforms/zhilian/company_main.py
Normal file
53
spiderJobs/platforms/zhilian/company_main.py
Normal file
@ -0,0 +1,53 @@
|
||||
"""
|
||||
智联招聘 独立公司爬虫入口
|
||||
|
||||
从 pending_company 队列获取待爬取的智联公司,
|
||||
逐个调用 GetCompanyDetail 获取详情并上传。
|
||||
|
||||
启动:
|
||||
python -m spiderJobs.platforms.zhilian.company_main
|
||||
|
||||
环境变量:
|
||||
API_BASE_URL 后端地址 (默认 http://127.0.0.1:9999)
|
||||
COMPANY_BATCH_SIZE 每批获取公司数 (默认 10)
|
||||
SLEEP_MIN_SECONDS 最小延迟秒数 (默认 10)
|
||||
SLEEP_MAX_SECONDS 最大延迟秒数 (默认 20)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
_project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..", ".."))
|
||||
if _project_root not in sys.path:
|
||||
sys.path.insert(0, _project_root)
|
||||
|
||||
from spiderJobs.core.base import BaseFetcher
|
||||
from spiderJobs.platforms.zhilian.api import GetCompanyDetail
|
||||
from spiderJobs.platforms.zhilian.client import ZhilianClient, create_cgate_client
|
||||
from spiderJobs.runner.company_loop import run_company_loop
|
||||
|
||||
|
||||
def create_company_fetcher(company_id: str, http_client: ZhilianClient) -> BaseFetcher:
    """Build a GetCompanyDetail fetcher; the company id is passed as its `number`."""
    fetcher = GetCompanyDetail(client=http_client, number=company_id)
    return fetcher
|
||||
|
||||
|
||||
def main():
    """Assemble Zhilian client options from environment variables and start the company loop.

    Only PROXY_URL is read; when set it is forwarded as the client's `proxy`.
    """
    proxy_url = os.environ.get("PROXY_URL", "")
    options = {"proxy": proxy_url} if proxy_url else {}

    run_company_loop(
        platform="zhilian",
        create_company_fetcher=create_company_fetcher,
        create_client_fn=create_cgate_client,
        client_kwargs=options,
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
35477
spiderJobs/platforms/zhilian/jobs.json
Normal file
35477
spiderJobs/platforms/zhilian/jobs.json
Normal file
File diff suppressed because one or more lines are too long
4
spiderJobs/runner/__init__.py
Normal file
4
spiderJobs/runner/__init__.py
Normal file
@ -0,0 +1,4 @@
|
||||
from spiderJobs.runner.api_client import RunnerAPIClient
|
||||
from spiderJobs.runner.company_loop import run_company_loop
|
||||
|
||||
__all__ = ["RunnerAPIClient", "run_company_loop"]
|
||||
214
spiderJobs/runner/api_client.py
Normal file
214
spiderJobs/runner/api_client.py
Normal file
@ -0,0 +1,214 @@
|
||||
"""
|
||||
runner.api_client - 爬虫与后端 API 的通信层
|
||||
|
||||
提供关键词获取、进度汇报、数据上传等功能。
|
||||
爬虫主循环通过此模块与后端交互,实现状态管理。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from typing import Any, Optional
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class RunnerAPIClient:
    """Backend API client used by the crawlers.

    Covers keyword scheduling, progress reporting, data upload, token lookup
    and the pending-company queue. All HTTP calls go through one shared
    session that sends the API token in the ``token`` header.
    """

    # Number of attempts made by the low-level GET/POST helpers.
    _MAX_ATTEMPTS = 3

    def __init__(
        self,
        base_url: str = "",
        api_token: str = "dev",
        platform: str = "",
        crawler_id: str = "",
    ):
        """
        Args:
            base_url: Backend address; falls back to the API_BASE_URL
                environment variable, then to http://127.0.0.1:9999.
            api_token: Token sent in the ``token`` header.
            platform: Platform identifier sent as ``source``/``platform``.
            crawler_id: Identifier of this crawler; generated from the
                platform and a random suffix when empty.
        """
        self.base_url = (
            base_url
            or os.environ.get("API_BASE_URL", "http://127.0.0.1:9999")
        ).rstrip("/")
        # NOTE(review): the parameter default "dev" is truthy, so API_TOKEN is
        # only consulted when a caller explicitly passes an empty api_token -
        # confirm whether the environment variable should win by default.
        self.api_token = api_token or os.environ.get("API_TOKEN", "dev")
        self.platform = platform
        self.crawler_id = crawler_id or f"{platform}-{uuid.uuid4().hex[:8]}"
        self._session = requests.Session()
        self._session.headers.update({"token": self.api_token})

    # ─────────────────────────────────────────────
    # Keyword scheduling
    # ─────────────────────────────────────────────

    def fetch_keyword(self, limit: int = 1) -> list[dict]:
        """Fetch available keywords from the backend, requesting a reservation.

        Returns:
            The ``data.items`` list of the response, or ``[]`` when the call
            failed, the response code is not 200, or ``data``/``items`` is
            missing or null.
        """
        resp = self._get(
            "/api/v1/keyword/available",
            params={
                "source": self.platform,
                "limit": limit,
                "reserve": "true",
                "crawler_id": self.crawler_id,
            },
        )
        if resp and resp.get("code") == 200:
            # "data" may be present but null; never call .get on None.
            return (resp.get("data") or {}).get("items") or []
        return []

    def report_page_progress(
        self,
        keyword_id: int,
        page: int,
        total_pages: int = 0,
        jobs_found: int = 0,
    ) -> dict:
        """Report progress for a single crawled page; returns the backend response."""
        return self._post(
            "/api/v1/keyword/page-progress",
            body={
                "source": self.platform,
                "keyword_id": keyword_id,
                "page": page,
                "total_pages": total_pages,
                "jobs_found": jobs_found,
            },
        )

    def report_crawl_complete(
        self,
        keyword_id: int,
        status: str = "completed",
        error_message: str = "",
    ) -> dict:
        """Report that crawling a keyword finished or failed; returns the backend response."""
        return self._post(
            "/api/v1/keyword/crawl-complete",
            body={
                "source": self.platform,
                "keyword_id": keyword_id,
                "status": status,
                "error_message": error_message,
            },
        )

    # ─────────────────────────────────────────────
    # Data upload
    # ─────────────────────────────────────────────

    def upload_data(
        self,
        data_list: list[dict],
        data_type: str = "job",
        channel: str = "mini",
    ) -> dict:
        """Upload a batch of records to the batch-store-async endpoint.

        Returns:
            The backend response, ``{}`` when every attempt failed, or a
            synthetic ``{"code": 200, ...}`` when ``data_list`` is empty
            (nothing is sent in that case).
        """
        if not data_list:
            return {"code": 200, "message": "空数据跳过"}
        print(
            f"[上报] {self.platform}/{data_type} | "
            f"条数={len(data_list)} | channel={channel} | "
            f"目标={self.base_url}/api/v1/universal/data/batch-store-async"
        )
        resp = self._post(
            "/api/v1/universal/data/batch-store-async",
            body={
                "data_list": data_list,
                "data_type": data_type,
                "platform": self.platform,
                "channel": channel,
            },
        )
        code = resp.get("code", "?")
        msg = resp.get("msg") or resp.get("message", "")
        stored = resp.get("data", {}).get("stored", "") if isinstance(resp.get("data"), dict) else ""
        print(f"[上报] 响应: code={code} msg={msg} {f'stored={stored}' if stored else ''}")
        return resp

    # ─────────────────────────────────────────────
    # Token management (needed by the Boss platform)
    # ─────────────────────────────────────────────

    def fetch_token(self) -> Optional[dict]:
        """Return the first token the backend lists for this platform, or None."""
        resp = self._get(
            "/api/v1/token/tokens",
            params={"platform": self.platform},
        )
        if resp and resp.get("code") == 200:
            tokens = resp.get("data", [])
            return tokens[0] if tokens else None
        return None

    # ─────────────────────────────────────────────
    # Company queue
    # ─────────────────────────────────────────────

    def fetch_pending_companies(
        self,
        limit: int = 10,
        status: str = "pending",
    ) -> list[dict]:
        """Fetch companies with the given status for this platform.

        Returns:
            The response's ``data`` value, or ``[]`` when the call failed,
            the response code is not 200, or ``data`` is missing or null.
        """
        resp = self._get(
            "/api/v1/cleaning/companies",
            params={
                "source": self.platform,
                "status": status,
                "page_size": limit,
            },
        )
        if resp and resp.get("code") == 200:
            return resp.get("data") or []
        return []

    def update_company_status(
        self,
        company_id: str,
        status: str = "done",
        error_message: str = "",
    ) -> dict:
        """Update a company's crawl status (done/failed); returns the backend response."""
        return self._post(
            "/api/v1/cleaning/update-company-status",
            body={
                "source": self.platform,
                "company_id": company_id,
                "status": status,
                "error_message": error_message,
            },
        )

    # ─────────────────────────────────────────────
    # Low-level HTTP
    # ─────────────────────────────────────────────

    def _request(self, method: str, path: str, timeout: int, **kwargs: Any) -> dict:
        """Send a request with retries and return the decoded JSON body.

        Makes up to _MAX_ATTEMPTS attempts, waiting 2s, 4s, ... between
        them. Returns ``{}`` when every attempt raised (network error or
        undecodable body); never raises itself.
        """
        url = f"{self.base_url}{path}"
        send = getattr(self._session, method.lower())
        for attempt in range(1, self._MAX_ATTEMPTS + 1):
            try:
                return send(url, timeout=timeout, **kwargs).json()
            except Exception as e:
                print(f"[API] {method} {path} 第{attempt}次失败: {e}")
                # Back off only when another attempt follows; sleeping after
                # the final failure would just delay the caller.
                if attempt < self._MAX_ATTEMPTS:
                    time.sleep(2 * attempt)
        return {}

    def _get(self, path: str, params: dict | None = None) -> dict:
        """GET with retries (15s timeout); ``{}`` on total failure."""
        return self._request("GET", path, 15, params=params)

    def _post(self, path: str, body: dict) -> dict:
        """POST a JSON body with retries (30s timeout); ``{}`` on total failure."""
        return self._request("POST", path, 30, json=body)
|
||||
121
spiderJobs/runner/company_loop.py
Normal file
121
spiderJobs/runner/company_loop.py
Normal file
@ -0,0 +1,121 @@
|
||||
"""
|
||||
runner.company_loop - 独立公司爬虫主循环
|
||||
|
||||
从后端 pending_company 队列获取待爬取公司,
|
||||
逐个调用平台 API 获取公司详情并上传。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import time
|
||||
import traceback
|
||||
from typing import Any, Callable
|
||||
|
||||
from spiderJobs.core.base import BaseFetcher
|
||||
from spiderJobs.runner.api_client import RunnerAPIClient
|
||||
from spiderJobs.runner.loop import sleep_random
|
||||
|
||||
|
||||
def run_company_loop(
    *,
    platform: str,
    create_company_fetcher: Callable[[str, Any], BaseFetcher],
    create_client_fn: Callable[..., Any],
    batch_size: int = 10,
    sleep_min: float = 10,
    sleep_max: float = 20,
    api_base_url: str = "",
    client_kwargs: dict | None = None,
) -> None:
    """Main loop of the standalone company crawler.

    Repeatedly pulls a batch of pending companies from the backend, fetches
    each company's detail through the platform fetcher, uploads the result
    and reports the per-company status. Runs until interrupted with
    KeyboardInterrupt.

    Args:
        platform: Platform identifier (boss/qcwy/zhilian).
        create_company_fetcher: Factory ``(company_id, http_client) -> BaseFetcher``.
        create_client_fn: Factory for the platform HTTP client.
        batch_size: Number of pending companies requested per batch.
        sleep_min: Lower bound of the random delay before each fetch (seconds).
        sleep_max: Upper bound of the random delay before each fetch (seconds).
        api_base_url: Backend API address.
        client_kwargs: Extra keyword arguments passed to ``create_client_fn``.

    The environment variables COMPANY_BATCH_SIZE, SLEEP_MIN_SECONDS and
    SLEEP_MAX_SECONDS, when set, take precedence over the corresponding
    arguments.
    """
    batch_size = int(os.environ.get("COMPANY_BATCH_SIZE", str(batch_size)))
    sleep_min = float(os.environ.get("SLEEP_MIN_SECONDS", str(sleep_min)))
    sleep_max = float(os.environ.get("SLEEP_MAX_SECONDS", str(sleep_max)))

    api = RunnerAPIClient(
        base_url=api_base_url,
        platform=platform,
    )

    print(f"[{platform}-company] 公司爬虫启动 | crawler_id={api.crawler_id}")
    print(f"[{platform}-company] API: {api.base_url} | batch={batch_size} | delay={sleep_min}-{sleep_max}s")

    http_client = create_client_fn(**(client_kwargs or {}))

    while True:
        try:
            # 1. Fetch the next batch of pending companies.
            companies = api.fetch_pending_companies(limit=batch_size, status="pending")
            if not companies:
                print(f"[{platform}-company] 无待处理公司,等待 120s ...")
                time.sleep(120)
                continue

            print(f"\n[{platform}-company] 获取到 {len(companies)} 个待处理公司")

            # 2. Crawl them one by one.
            success_count = 0
            fail_count = 0

            for company in companies:
                company_id = company.get("company_id", "")
                company_name = company.get("company_name", "")

                if not company_id:
                    continue

                sleep_random(sleep_min, sleep_max)

                try:
                    fetcher = create_company_fetcher(company_id, http_client)
                    result = fetcher.fetch()

                    if result.success and result.data:
                        # Upload the company data.
                        data_to_upload = result.data if isinstance(result.data, dict) else {"raw": result.data}
                        upload_resp = api.upload_data([data_to_upload], data_type="company")

                        # upload_data returns an empty dict only when every
                        # attempt failed; marking the company "done" then
                        # would drop the data for good.
                        # NOTE(review): a non-empty response with an error
                        # code is still treated as success - confirm the
                        # backend's success code before tightening this.
                        if not upload_resp:
                            api.update_company_status(
                                company_id,
                                status="failed",
                                error_message="upload failed",
                            )
                            fail_count += 1
                            print(f" [FAIL] {company_name or company_id}: upload failed")
                            continue

                        # Mark as done.
                        api.update_company_status(company_id, status="done")
                        success_count += 1
                        print(f" [OK] {company_name or company_id}")
                    else:
                        api.update_company_status(
                            company_id,
                            status="failed",
                            error_message=result.error or "empty data",
                        )
                        fail_count += 1
                        print(f" [FAIL] {company_name or company_id}: {result.error}")

                except Exception as e:
                    # A failure for one company must not abort the batch;
                    # the message is truncated before being reported.
                    api.update_company_status(
                        company_id,
                        status="failed",
                        error_message=str(e)[:500],
                    )
                    fail_count += 1
                    print(f" [ERROR] {company_name or company_id}: {e}")

            print(
                f"[{platform}-company] 批次完成: 成功={success_count} 失败={fail_count}"
            )

        except KeyboardInterrupt:
            print(f"\n[{platform}-company] 收到中断信号,退出...")
            break
        except Exception as e:
            # Unexpected error at batch level: log it, pause, then retry.
            print(f"[{platform}-company] 主循环异常: {e}")
            traceback.print_exc()
            time.sleep(30)
|
||||
57
tests/test_company_jobs_sync.py
Normal file
57
tests/test_company_jobs_sync.py
Normal file
@ -0,0 +1,57 @@
|
||||
import unittest
|
||||
|
||||
from app.services.company_jobs_sync import CompanyJobsSyncService
|
||||
|
||||
|
||||
class CompanyJobsSyncServiceTests(unittest.TestCase):
    """Each platform extractor must find both jobs in its sample payload."""

    def test_extract_boss_jobs(self):
        job_list = [{"encryptJobId": "job-1"}, {"encryptJobId": "job-2"}]
        extracted = CompanyJobsSyncService._extract_boss_jobs(
            {"zpData": {"jobList": job_list}}
        )
        self.assertEqual(len(extracted), 2)

    def test_extract_qcwy_jobs(self):
        items = [{"jobId": "1001"}, {"jobId": "1002"}]
        extracted = CompanyJobsSyncService._extract_qcwy_jobs(
            {"resultbody": {"job": {"items": items}}}
        )
        self.assertEqual(len(extracted), 2)

    def test_extract_zhilian_jobs(self):
        nested = [{"number": "zl-1"}, {"number": "zl-2"}]
        extracted = CompanyJobsSyncService._extract_zhilian_jobs(
            {"data": {"list": nested}}
        )
        self.assertEqual(len(extracted), 2)

    def test_extract_zhilian_jobs_from_top_level_list(self):
        top_level = [{"number": "zl-top-1"}, {"number": "zl-top-2"}]
        extracted = CompanyJobsSyncService._extract_zhilian_jobs({"list": top_level})
        self.assertEqual(len(extracted), 2)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
87
tests/test_company_storage.py
Normal file
87
tests/test_company_storage.py
Normal file
@ -0,0 +1,87 @@
|
||||
import unittest
|
||||
|
||||
from app.services.company_storage import extract_company_fields, normalize_company_id
|
||||
|
||||
|
||||
class CompanyStorageTests(unittest.TestCase):
    """Covers company-id normalisation and per-platform field extraction."""

    def test_normalize_qcwy_company_id(self):
        cases = (
            ("qcwy", "co123", "123"),
            ("qcwy", "123", "123"),
            ("boss", "co123", "co123"),
        )
        for source, raw_id, expected in cases:
            self.assertEqual(normalize_company_id(source, raw_id), expected)

    def test_extract_boss_fields(self):
        brand_info = {
            "encryptBrandId": "boss-1",
            "brandName": "Boss公司",
            "industryName": "互联网",
            "scaleName": "100-499人",
            "stageName": "B轮",
            "logo": "https://example.com/logo.png",
            "introduce": "品牌简介",
        }
        full_info = {
            "name": "Boss公司",
            "typeName": "民营",
            "cityName": "上海",
            "address": "上海市徐汇区",
            "website": "https://boss.example.com",
        }
        payload = {"zpData": {"brandComInfoVO": brand_info, "companyFullInfoVO": full_info}}

        fields = extract_company_fields("boss", payload, "boss-1")

        expectations = {
            "source_company_id": "boss-1",
            "company_name": "Boss公司",
            "industry": "互联网",
            "financing_stage": "B轮",
        }
        for key, expected in expectations.items():
            self.assertEqual(fields[key], expected)

    def test_extract_qcwy_fields(self):
        company_info = {
            "coid": "123",
            "coname": "前程公司",
            "cotype": "民营",
            "indtype1": "制造业",
            "cosize": "500-999人",
            "areaString": "广州",
            "caddr": "广州市天河区",
            "webUrl": "https://qcwy.example.com",
            "logourl": "https://qcwy.example.com/logo.png",
            "coinfo": "公司简介",
        }
        payload = {"coinfo": company_info, "financingStage": {"name": "未融资"}}

        fields = extract_company_fields("qcwy", payload, "co123")

        expectations = {
            "source_company_id": "123",
            "company_name": "前程公司",
            "company_size": "500-999人",
            "website": "https://qcwy.example.com",
        }
        for key, expected in expectations.items():
            self.assertEqual(fields[key], expected)

    def test_extract_zhilian_fields(self):
        company_base = {
            "companyNumber": "zl-1",
            "companyName": "智联公司",
            "companyTypeName": "上市公司",
            "industryName": "教育",
            "companySize": "1000-9999人",
            "financingStage": {"name": "已上市"},
            "cityName": "北京",
            "address": "北京市海淀区",
            "companyUrl": "https://zl.example.com",
            "logoUrl": "https://zl.example.com/logo.png",
            "companyDescWithHtml": "<p>公司简介</p>",
        }
        payload = {"data": {"companyBase": company_base}}

        fields = extract_company_fields("zhilian", payload, "zl-1")

        expectations = {
            "source_company_id": "zl-1",
            "company_name": "智联公司",
            "company_type": "上市公司",
            "description": "<p>公司简介</p>",
        }
        for key, expected in expectations.items():
            self.assertEqual(fields[key], expected)
|
||||
|
||||
|
||||
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
174
web/CLAUDE.md
Normal file
174
web/CLAUDE.md
Normal file
@ -0,0 +1,174 @@
|
||||
[根目录](../CLAUDE.md) > **web**
|
||||
|
||||
# web - Vue3 前端模块
|
||||
|
||||
## 模块职责
|
||||
|
||||
基于 Vue 3 + Naive UI 构建的管理后台,提供:用户/角色/权限/菜单/部门的系统管理,三平台招聘数据的浏览与搜索,数据采集趋势与来源分布的统计看板(ECharts),定向数据清洗操作,关键词与代理 IP 管理。
|
||||
|
||||
---
|
||||
|
||||
## 入口与启动
|
||||
|
||||
| 文件 | 说明 |
|
||||
|------|------|
|
||||
| `web/src/main.js` | Vue 应用入口,依次初始化 Store、Router、指令、i18n,挂载到 `#app` |
|
||||
| `web/src/App.vue` | 根组件 |
|
||||
| `web/src/router/index.js` | Vue Router 配置,含路由守卫(认证、页面 Loading、标题) |
|
||||
| `web/src/store/index.js` | Pinia Store 入口 |
|
||||
|
||||
### 本地开发
|
||||
|
||||
```bash
|
||||
cd web
|
||||
pnpm install
|
||||
pnpm dev # Vite dev server,默认 http://localhost:5173
|
||||
pnpm build # 构建到 web/dist/
|
||||
pnpm lint # ESLint 检查
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 页面路由结构
|
||||
|
||||
| 路由路径 | 视图文件 | 说明 |
|
||||
|----------|----------|------|
|
||||
| `/login` | `views/login/index.vue` | 登录页 |
|
||||
| `/analytics` | `views/analytics/index.vue` | 数据分析看板(ECharts 趋势 + 来源饼图) |
|
||||
| `/recruitment/qcwy` | `views/recruitment/qcwy/index.vue` | 前程无忧数据浏览 |
|
||||
| `/recruitment/zhilian` | `views/recruitment/zhilian/index.vue` | 智联招聘数据浏览 |
|
||||
| `/recruitment/boss` | `views/recruitment/boss/index.vue` | Boss 直聘数据浏览 |
|
||||
| `/cleaning/targeted` | `views/cleaning/index.vue` | 定向数据清洗 |
|
||||
| `/cleaning/monitor` | `views/cleaning/monitor.vue` | 清洗任务监控 |
|
||||
| `/keyword` | `views/keyword/index.vue` | 关键词管理 |
|
||||
| `/profile` | `views/profile/index.vue` | 个人中心 |
|
||||
| `/system/user` | `views/system/user/index.vue` | 用户管理 |
|
||||
| `/system/role` | `views/system/role/index.vue` | 角色管理 |
|
||||
| `/system/menu` | `views/system/menu/index.vue` | 菜单管理 |
|
||||
| `/system/api` | `views/system/api/index.vue` | API 权限管理 |
|
||||
| `/system/dept` | `views/system/dept/index.vue` | 部门管理 |
|
||||
| `/system/auditlog` | `views/system/auditlog/index.vue` | 审计日志 |
|
||||
| `/system/proxy` | `views/system/proxy/index.vue` | 代理 IP 管理 |
|
||||
| `/system/token` | `views/system/token/index.vue` | Boss Token 管理 |
|
||||
|
||||
---
|
||||
|
||||
## 对外接口(API 调用层)
|
||||
|
||||
前端 API 模块位于 `web/src/api/`,通过 axios 封装:
|
||||
|
||||
| 文件 | 接口对象 | 说明 |
|
||||
|------|----------|------|
|
||||
| `api/index.js` | 所有系统接口 | 用户、角色、菜单、部门、API 管理等 |
|
||||
| `api/analytics.js` | `getOverview` / `getVolumeTrend` / `getSourceDistribution` | 数据分析统计接口 |
|
||||
| `api/keyword.js` | keyword CRUD | 关键词管理 |
|
||||
| `api/proxy.js` | proxy CRUD | 代理 IP 管理 |
|
||||
| `api/token.js` | token CRUD | Boss Token 管理 |
|
||||
|
||||
HTTP 工具层:`web/src/utils/http/`(基于 axios,含拦截器)
|
||||
|
||||
认证:JWT Token 存储于 localStorage,通过请求拦截器自动注入 `Authorization` 头。
|
||||
|
||||
---
|
||||
|
||||
## 关键依赖与配置
|
||||
|
||||
```json
|
||||
{
|
||||
"主框架": "vue@^3.3.4",
|
||||
"UI 库": "naive-ui@^2.34.4",
|
||||
"状态管理": "pinia@^2.1.6",
|
||||
"路由": "vue-router@^4.2.4",
|
||||
"图表": "echarts@^6.0.0",
|
||||
"HTTP": "axios@^1.4.0",
|
||||
"图标": "@iconify/vue + @iconify/json",
|
||||
"工具": "@vueuse/core, lodash-es, dayjs",
|
||||
"构建": "vite@^4.4.6",
|
||||
"原子 CSS": "unocss@^66.5.10"
|
||||
}
|
||||
```
|
||||
|
||||
Vite 配置:`web/vite.config.js`(含 `@vitejs/plugin-vue`, `unocss`, `unplugin-icons`)。
|
||||
|
||||
---
|
||||
|
||||
## 数据模型(前端状态)
|
||||
|
||||
Pinia Store 模块(`web/src/store/modules/`):
|
||||
|
||||
| Store 模块 | 说明 |
|
||||
|------------|------|
|
||||
| `user` | 当前登录用户信息、Token |
|
||||
| `permission` | 动态路由权限(菜单权限列表) |
|
||||
| `app` | 全局 UI 状态(侧边栏折叠、主题等) |
|
||||
| `tags` | 多页签(Keep-Alive 页签管理) |
|
||||
|
||||
---
|
||||
|
||||
## 数据分析看板关键实现
|
||||
|
||||
`web/src/views/analytics/index.vue` 使用 ECharts 渲染:
|
||||
- **趋势折线图**:按 hour/day/week/month 粒度,分 boss/qcwy/zhilian 三条折线,支持 dataZoom 交互
|
||||
- **来源饼图**:环形饼图展示各平台数据占比
|
||||
- **时间预设**:近 24h / 7d / 30d / 90d / 6m / 12m / 全部 / 自定义
|
||||
|
||||
---
|
||||
|
||||
## 测试与质量
|
||||
|
||||
- 当前无自动化测试文件(缺口)。
|
||||
- 代码规范:ESLint (`@zclzone` + `@unocss` 配置) + `prettier`。
|
||||
- 建议补充:Vitest 单元测试(工具函数)和 Playwright E2E 测试(登录、数据查看流程)。
|
||||
|
||||
---
|
||||
|
||||
## 常见问题 (FAQ)
|
||||
|
||||
**Q: 前端无法连接后端 API?**
|
||||
A: 检查 `web/src/utils/http/index.js` 中的 `baseURL` 配置,或在 Vite 配置中设置代理(`vite.config.js` 的 `server.proxy`)。
|
||||
|
||||
**Q: 动态路由/菜单不更新?**
|
||||
A: 登出后重新登录会重新拉取后端菜单树。如果菜单在后端已新增,前端 permission store 会在下次路由守卫拦截时重新请求。
|
||||
|
||||
**Q: 图表不显示?**
|
||||
A: 检查 ClickHouse 是否可达(后端 `/api/v1/analytics/overview` 是否返回数据),以及图表容器 `div` 的高度是否为 0。
|
||||
|
||||
---
|
||||
|
||||
## 相关文件清单
|
||||
|
||||
```
|
||||
web/src/
|
||||
├── main.js # 应用入口
|
||||
├── App.vue # 根组件
|
||||
├── api/ # API 调用层
|
||||
│ ├── analytics.js
|
||||
│ ├── keyword.js
|
||||
│ ├── proxy.js
|
||||
│ └── token.js
|
||||
├── components/ # 公共组件(CrudTable, CrudModal, QueryBar 等)
|
||||
├── layout/ # 布局(侧边栏、顶栏、标签页)
|
||||
├── router/ # 路由配置与守卫
|
||||
├── store/ # Pinia Store
|
||||
├── utils/
|
||||
│ ├── auth/ # JWT Token 工具
|
||||
│ ├── http/ # axios 封装
|
||||
│ └── storage/ # localStorage 封装
|
||||
└── views/
|
||||
├── analytics/ # 数据分析看板
|
||||
├── cleaning/ # 数据清洗页面(定向清洗、任务监控)
|
||||
├── keyword/ # 关键词管理
|
||||
├── recruitment/ # 三平台数据浏览
|
||||
│ ├── boss/
|
||||
│ ├── qcwy/
|
||||
│ └── zhilian/
|
||||
└── system/ # 系统管理(用户/角色/菜单/API/部门/审计/代理/Token)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 变更记录 (Changelog)
|
||||
|
||||
| 日期 | 说明 |
|
||||
|------|------|
|
||||
| 2026-03-20 | 初始化模块文档 |
|
||||
@ -26,7 +26,7 @@ export function createVitePlugins(viteEnv, isBuild) {
|
||||
open: true,
|
||||
gzipSize: true,
|
||||
brotliSize: true,
|
||||
}),
|
||||
})
|
||||
)
|
||||
}
|
||||
|
||||
|
||||
@ -25,10 +25,10 @@ export default [
|
||||
Icons({
|
||||
compiler: 'vue3',
|
||||
customCollections: {
|
||||
custom: FileSystemIconLoader(customIconPath)
|
||||
custom: FileSystemIconLoader(customIconPath),
|
||||
},
|
||||
scale: 1,
|
||||
defaultClass: 'inline-block'
|
||||
defaultClass: 'inline-block',
|
||||
}),
|
||||
Components({
|
||||
resolvers: [
|
||||
|
||||
1162
web/pnpm-lock.yaml
generated
1162
web/pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load Diff
@ -3,10 +3,10 @@ import { request } from '@/utils'
|
||||
export default {
|
||||
// 获取统计总览 (Total Jobs)
|
||||
getOverview: (params) => request.get('/analytics/overview', { params }),
|
||||
|
||||
|
||||
// 获取数据量趋势 (Volume Trend)
|
||||
getVolumeTrend: (params) => request.get('/analytics/trend/volume', { params }),
|
||||
|
||||
|
||||
// 获取数据来源分布 (Source Distribution)
|
||||
getSourceDistribution: (params) => request.get('/analytics/distribution/source', { params }),
|
||||
}
|
||||
|
||||
@ -53,7 +53,7 @@ const isEmpty = computed(() => props.empty && !props.loading && network.value)
|
||||
const showPlaceholder = computed(() => props.loading || isEmpty.value || !network.value)
|
||||
|
||||
const networkErrorDesc = computed(() =>
|
||||
props.showNetworkReload ? `${NETWORK_ERROR_MSG}, 点击重试` : NETWORK_ERROR_MSG,
|
||||
props.showNetworkReload ? `${NETWORK_ERROR_MSG}, 点击重试` : NETWORK_ERROR_MSG
|
||||
)
|
||||
|
||||
function handleReload() {
|
||||
@ -71,7 +71,7 @@ const stopHandle = watch(
|
||||
if (!newValue) {
|
||||
network.value = window.navigator.onLine
|
||||
}
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
onUnmounted(() => {
|
||||
|
||||
@ -28,7 +28,7 @@ watchDebounced(
|
||||
filterIcons()
|
||||
emit('update:value', choosed.value)
|
||||
},
|
||||
{ debounce: 200 },
|
||||
{ debounce: 200 }
|
||||
)
|
||||
</script>
|
||||
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
<header v-if="showHeader" mb-15 min-h-45 flex items-center justify-between px-15>
|
||||
<slot v-if="$slots.header" name="header" />
|
||||
<template v-else>
|
||||
<h2 text-22 font-normal text-hex-333 dark:text-hex-ccc>{{ title || route.meta?.title }}</h2>
|
||||
<h2 text-22 text-hex-333 font-normal dark:text-hex-ccc>{{ title || route.meta?.title }}</h2>
|
||||
<slot name="action" />
|
||||
</template>
|
||||
</header>
|
||||
|
||||
@ -8,8 +8,8 @@
|
||||
max-w-150
|
||||
flex-shrink-0
|
||||
text-16
|
||||
font-bold
|
||||
color-primary
|
||||
font-bold
|
||||
>
|
||||
{{ title }}
|
||||
</h2>
|
||||
|
||||
@ -48,7 +48,7 @@ watch(
|
||||
const title = route.meta?.title
|
||||
tagsStore.addTag({ name, path, title })
|
||||
},
|
||||
{ immediate: true },
|
||||
{ immediate: true }
|
||||
)
|
||||
|
||||
watch(
|
||||
@ -60,7 +60,7 @@ watch(
|
||||
const { offsetLeft: x, offsetWidth: width } = activeTabElement
|
||||
scrollXRef.value?.handleScroll(x + width, width)
|
||||
},
|
||||
{ immediate: true },
|
||||
{ immediate: true }
|
||||
)
|
||||
|
||||
const handleTagClick = (path) => {
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
<AppPage :show-footer="false">
|
||||
<div flex-1>
|
||||
<!-- 筛选栏 -->
|
||||
<n-card rounded-10 mb-15>
|
||||
<n-card mb-15 rounded-10>
|
||||
<n-space align="center">
|
||||
<n-select
|
||||
v-model:value="query.preset"
|
||||
@ -27,16 +27,16 @@
|
||||
</n-card>
|
||||
|
||||
<!-- 概览数据 -->
|
||||
<n-grid :x-gap="15" :y-gap="15" :cols="4" mb-15>
|
||||
<n-gi>
|
||||
<n-card rounded-10 size="small">
|
||||
<n-statistic label="近选定时间段总量" :value="totalFromSources">
|
||||
<template #prefix>
|
||||
<TheIcon icon="mdi:database" color="#2080f0" :size="24" />
|
||||
</template>
|
||||
</n-statistic>
|
||||
</n-card>
|
||||
</n-gi>
|
||||
<n-grid :x-gap="15" :y-gap="15" :cols="4" mb-15>
|
||||
<n-gi>
|
||||
<n-card rounded-10 size="small">
|
||||
<n-statistic label="近选定时间段总量" :value="totalFromSources">
|
||||
<template #prefix>
|
||||
<TheIcon icon="mdi:database" color="#2080f0" :size="24" />
|
||||
</template>
|
||||
</n-statistic>
|
||||
</n-card>
|
||||
</n-gi>
|
||||
<n-gi>
|
||||
<n-card rounded-10 size="small">
|
||||
<n-statistic label="Boss直聘" :value="getSourceCount('boss')">
|
||||
@ -77,11 +77,11 @@
|
||||
|
||||
<n-grid :x-gap="15" :y-gap="15" :cols="2">
|
||||
<n-gi>
|
||||
<n-card title="数据来源占比" rounded-10>
|
||||
<div ref="sourceChartRef" style="height: 300px"></div>
|
||||
<n-card title="数据来源占比" rounded-10>
|
||||
<div ref="sourceChartRef" style="height: 300px"></div>
|
||||
</n-card>
|
||||
</n-gi>
|
||||
<n-gi>
|
||||
<n-gi>
|
||||
<n-card title="系统状态" rounded-10>
|
||||
<div flex items-center justify-center style="height: 300px; color: #999">
|
||||
<n-result status="success" title="系统运行正常">
|
||||
@ -93,7 +93,6 @@
|
||||
</n-card>
|
||||
</n-gi>
|
||||
</n-grid>
|
||||
|
||||
</div>
|
||||
</AppPage>
|
||||
</template>
|
||||
@ -114,12 +113,14 @@ const dateRange = ref(null)
|
||||
|
||||
const overview = ref({
|
||||
total_jobs: 0,
|
||||
period: {}
|
||||
period: {},
|
||||
})
|
||||
|
||||
const trendData = ref([])
|
||||
const sourceDistribution = ref([])
|
||||
const totalFromSources = computed(() => sourceDistribution.value.reduce((sum, i) => sum + (i.job_count || 0), 0))
|
||||
const totalFromSources = computed(() =>
|
||||
sourceDistribution.value.reduce((sum, i) => sum + (i.job_count || 0), 0)
|
||||
)
|
||||
|
||||
// Charts refs
|
||||
const trendChartRef = ref(null)
|
||||
@ -147,7 +148,7 @@ const rangeOptions = [
|
||||
]
|
||||
|
||||
/**
 * Look up the job count reported for one source in `sourceDistribution`.
 *
 * @param {string} source - Category key to match against each entry's `category`.
 * @returns {number} The matching entry's `job_count`, or 0 when no entry matches.
 */
const getSourceCount = (source) => {
  const matchesSource = (entry) => entry.category === source
  const matched = sourceDistribution.value.find(matchesSource)
  if (!matched) return 0
  return matched.job_count
}
|
||||
|
||||
@ -217,21 +218,20 @@ const handleRefresh = () => {
|
||||
const fetchData = async () => {
|
||||
try {
|
||||
const params = getParams()
|
||||
|
||||
|
||||
// Parallel requests
|
||||
const [overviewRes, trendRes, sourceRes] = await Promise.all([
|
||||
api.getOverview(params),
|
||||
api.getVolumeTrend(params),
|
||||
api.getSourceDistribution(params)
|
||||
api.getSourceDistribution(params),
|
||||
])
|
||||
|
||||
const unwrap = (res) => (res && res.data) ? res.data : res
|
||||
const unwrap = (res) => (res && res.data ? res.data : res)
|
||||
overview.value = unwrap(overviewRes) || { total_jobs: 0, period: {} }
|
||||
trendData.value = unwrap(trendRes) || []
|
||||
sourceDistribution.value = unwrap(sourceRes) || []
|
||||
|
||||
|
||||
renderCharts()
|
||||
|
||||
} catch (error) {
|
||||
console.error(error)
|
||||
message.error('获取数据失败')
|
||||
@ -246,81 +246,78 @@ const renderCharts = () => {
|
||||
/**
 * Render (or refresh) the multi-series volume trend line chart.
 *
 * Reads rows of `{ time, source, count }` from `trendData`, builds one line
 * per known source over the sorted set of distinct `time` values (missing
 * points are plotted as 0), and pushes the option into the lazily created
 * ECharts instance. Does nothing while the chart container is not mounted.
 */
const renderTrendChart = () => {
  if (!trendChartRef.value) return
  if (!trendChart) trendChart = echarts.init(trendChartRef.value)

  // Single source of truth for series order, series names and legend entries.
  const sourceLabels = {
    boss: 'Boss直聘',
    qcwy: '前程无忧',
    zhilian: '智联招聘',
  }
  const sources = Object.keys(sourceLabels)

  const rows = trendData.value
  // All distinct timestamps, in default (string) sort order.
  const times = [...new Set(rows.map((row) => row.time))].sort()

  // Index rows once as source -> time -> row, instead of scanning every row
  // for every (source, time) pair.
  const rowsBySource = new Map(sources.map((source) => [source, new Map()]))
  for (const row of rows) {
    const rowsByTime = rowsBySource.get(row.source)
    // Keep the first row per (source, time), which is what a linear search would return.
    if (rowsByTime && !rowsByTime.has(row.time)) rowsByTime.set(row.time, row)
  }

  const series = sources.map((source) => {
    const rowsByTime = rowsBySource.get(source)
    return {
      name: sourceLabels[source],
      type: 'line',
      smooth: true,
      data: times.map((time) => {
        const row = rowsByTime.get(time)
        return row ? row.count : 0
      }),
    }
  })

  // Axis label format depends on the selected aggregation interval.
  const formatTimeLabel = (time) => {
    const d = new Date(time)
    if (query.value.interval === 'hour') {
      return `${d.getMonth() + 1}-${d.getDate()} ${d.getHours()}:00`
    }
    if (query.value.interval === 'week') {
      const m = d.getMonth() + 1
      const day = d.getDate()
      return `${d.getFullYear()}-${m}-${day}`
    }
    if (query.value.interval === 'month') {
      const m = d.getMonth() + 1
      return `${d.getFullYear()}-${m}`
    }
    // NOTE(review): this branch formats in UTC while the branches above use
    // local time; depending on how the backend encodes `time`, the label may
    // be off by one day in non-UTC time zones. Confirm the timestamp format.
    return d.toISOString().split('T')[0]
  }

  const option = {
    tooltip: {
      trigger: 'axis',
    },
    legend: {
      data: Object.values(sourceLabels),
    },
    dataZoom: [{ type: 'slider', realtime: true }, { type: 'inside' }],
    grid: {
      left: '3%',
      right: '4%',
      bottom: '3%',
      containLabel: true,
    },
    xAxis: {
      type: 'category',
      boundaryGap: false,
      data: times.map(formatTimeLabel),
    },
    yAxis: {
      type: 'value',
    },
    series: series,
  }

  trendChart.setOption(option)
}
|
||||
|
||||
const renderSourceChart = () => {
|
||||
if (!sourceChartRef.value) return
|
||||
if (!sourceChart) sourceChart = echarts.init(sourceChartRef.value)
|
||||
|
||||
|
||||
const option = {
|
||||
tooltip: {
|
||||
trigger: 'item'
|
||||
trigger: 'item',
|
||||
},
|
||||
legend: {
|
||||
top: '5%',
|
||||
left: 'center'
|
||||
left: 'center',
|
||||
},
|
||||
series: [
|
||||
{
|
||||
@ -331,30 +328,35 @@ const renderSourceChart = () => {
|
||||
itemStyle: {
|
||||
borderRadius: 10,
|
||||
borderColor: '#fff',
|
||||
borderWidth: 2
|
||||
borderWidth: 2,
|
||||
},
|
||||
label: {
|
||||
show: false,
|
||||
position: 'center'
|
||||
position: 'center',
|
||||
},
|
||||
emphasis: {
|
||||
label: {
|
||||
show: true,
|
||||
fontSize: 20,
|
||||
fontWeight: 'bold'
|
||||
}
|
||||
fontWeight: 'bold',
|
||||
},
|
||||
},
|
||||
labelLine: {
|
||||
show: false
|
||||
show: false,
|
||||
},
|
||||
data: sourceDistribution.value.map(item => ({
|
||||
data: sourceDistribution.value.map((item) => ({
|
||||
value: item.job_count,
|
||||
name: item.category === 'boss' ? 'Boss直聘' : item.category === 'qcwy' ? '前程无忧' : '智联招聘'
|
||||
}))
|
||||
}
|
||||
]
|
||||
name:
|
||||
item.category === 'boss'
|
||||
? 'Boss直聘'
|
||||
: item.category === 'qcwy'
|
||||
? '前程无忧'
|
||||
: '智联招聘',
|
||||
})),
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
sourceChart.setOption(option)
|
||||
}
|
||||
|
||||
|
||||
@ -1,119 +1,126 @@
|
||||
<template>
|
||||
<CommonPage title="定向数据清洗">
|
||||
<div class="h-full flex flex-col">
|
||||
<div class="mb-4 flex justify-between items-center bg-white p-4 rounded shadow-sm dark:bg-gray-800">
|
||||
<n-space>
|
||||
<n-button type="primary" @click="showUploadModal = true">
|
||||
<template #icon><TheIcon icon="mdi:cloud-upload" /></template>
|
||||
上传数据
|
||||
</n-button>
|
||||
<n-popconfirm @positive-click="handleClear">
|
||||
<template #trigger>
|
||||
<n-button type="error">
|
||||
<template #icon><TheIcon icon="mdi:delete-forever" /></template>
|
||||
清空所有
|
||||
</n-button>
|
||||
</template>
|
||||
确定要清空所有清洗任务吗?
|
||||
</n-popconfirm>
|
||||
</n-space>
|
||||
</div>
|
||||
|
||||
<CrudTable
|
||||
ref="$table"
|
||||
v-model:query-items="queryItems"
|
||||
:columns="columns"
|
||||
:get-data="api.getCleaningTasks"
|
||||
>
|
||||
<template #queryBar>
|
||||
<QueryBarItem label="状态" :label-width="60">
|
||||
<n-select
|
||||
v-model:value="queryItems.status"
|
||||
:options="statusOptions"
|
||||
placeholder="状态"
|
||||
clearable
|
||||
style="width: 200px"
|
||||
/>
|
||||
</QueryBarItem>
|
||||
<QueryBarItem label="清洗模式" :label-width="80">
|
||||
<n-select
|
||||
v-model:value="queryItems.clean_type"
|
||||
:options="cleanTypeOptions"
|
||||
placeholder="清洗模式"
|
||||
clearable
|
||||
style="width: 220px"
|
||||
/>
|
||||
</QueryBarItem>
|
||||
<QueryBarItem label="目标" :label-width="50">
|
||||
<n-input
|
||||
v-model:value="queryItems.target"
|
||||
placeholder="搜索目标"
|
||||
clearable
|
||||
@keyup.enter="$table?.handleSearch()"
|
||||
style="width: 260px"
|
||||
/>
|
||||
</QueryBarItem>
|
||||
<div
|
||||
class="mb-4 flex items-center justify-between rounded bg-white p-4 shadow-sm dark:bg-gray-800"
|
||||
>
|
||||
<n-space>
|
||||
<n-button type="primary" @click="showUploadModal = true">
|
||||
<template #icon><TheIcon icon="mdi:cloud-upload" /></template>
|
||||
上传数据
|
||||
</n-button>
|
||||
<n-popconfirm @positive-click="handleClear">
|
||||
<template #trigger>
|
||||
<n-button type="error">
|
||||
<template #icon><TheIcon icon="mdi:delete-forever" /></template>
|
||||
清空所有
|
||||
</n-button>
|
||||
</template>
|
||||
</CrudTable>
|
||||
确定要清空所有清洗任务吗?
|
||||
</n-popconfirm>
|
||||
</n-space>
|
||||
</div>
|
||||
|
||||
<!-- Upload Modal -->
|
||||
<n-modal v-model:show="showUploadModal" preset="card" title="上传待清洗数据" style="width: 600px">
|
||||
<n-form label-placement="left" label-width="100">
|
||||
<n-form-item label="清洗模式">
|
||||
<div class="flex flex-col w-full gap-2">
|
||||
<n-select v-model:value="uploadForm.cleanType" :options="cleanTypeOptions" />
|
||||
<div v-if="currentExample" class="text-xs text-gray-500">
|
||||
<n-a :href="currentExample.url" target="_blank" download class="flex items-center">
|
||||
<TheIcon icon="mdi:download" class="mr-1" />
|
||||
下载{{ currentExample.name }}示例文件
|
||||
</n-a>
|
||||
</div>
|
||||
</div>
|
||||
</n-form-item>
|
||||
<n-form-item label="目标平台">
|
||||
<n-select v-model:value="uploadForm.platform" :options="platformOptions" />
|
||||
</n-form-item>
|
||||
<n-form-item label="代理地址">
|
||||
<div class="flex flex-col w-full gap-2">
|
||||
<n-select
|
||||
v-model:value="selectedProxyId"
|
||||
:options="proxyOptions"
|
||||
placeholder="选择已配置的代理(可选)"
|
||||
clearable
|
||||
@update:value="handleProxySelect"
|
||||
/>
|
||||
<n-input
|
||||
v-model:value="uploadForm.proxy"
|
||||
placeholder="例如 http://user:pass@ip:port,留空则使用默认"
|
||||
/>
|
||||
</div>
|
||||
</n-form-item>
|
||||
<n-form-item label="文件">
|
||||
<n-upload
|
||||
directory-dnd
|
||||
:custom-request="customUploadRequest"
|
||||
:max="1"
|
||||
accept=".txt,.csv,.xlsx"
|
||||
>
|
||||
<n-upload-dragger>
|
||||
<div style="margin-bottom: 12px">
|
||||
<TheIcon icon="mdi:cloud-upload" :size="48" />
|
||||
</div>
|
||||
<n-text style="font-size: 16px">点击或拖动文件上传</n-text>
|
||||
<n-p depth="3" style="margin: 8px 0 0 0">
|
||||
支持 .txt, .csv, .xlsx 文件,每行一个目标<br>
|
||||
请确保文件内容与选择的清洗模式和平台一致
|
||||
</n-p>
|
||||
</n-upload-dragger>
|
||||
</n-upload>
|
||||
</n-form-item>
|
||||
</n-form>
|
||||
</n-modal>
|
||||
|
||||
<!-- Detail Modal -->
|
||||
<n-modal v-model:show="showDetailModal" preset="card" title="结果详情" style="width: 600px">
|
||||
<n-code :code="detailJson" language="json" word-wrap />
|
||||
</n-modal>
|
||||
<CrudTable
|
||||
ref="$table"
|
||||
v-model:query-items="queryItems"
|
||||
:columns="columns"
|
||||
:get-data="api.getCleaningTasks"
|
||||
>
|
||||
<template #queryBar>
|
||||
<QueryBarItem label="状态" :label-width="60">
|
||||
<n-select
|
||||
v-model:value="queryItems.status"
|
||||
:options="statusOptions"
|
||||
placeholder="状态"
|
||||
clearable
|
||||
style="width: 200px"
|
||||
/>
|
||||
</QueryBarItem>
|
||||
<QueryBarItem label="清洗模式" :label-width="80">
|
||||
<n-select
|
||||
v-model:value="queryItems.clean_type"
|
||||
:options="cleanTypeOptions"
|
||||
placeholder="清洗模式"
|
||||
clearable
|
||||
style="width: 220px"
|
||||
/>
|
||||
</QueryBarItem>
|
||||
<QueryBarItem label="目标" :label-width="50">
|
||||
<n-input
|
||||
v-model:value="queryItems.target"
|
||||
placeholder="搜索目标"
|
||||
clearable
|
||||
style="width: 260px"
|
||||
@keyup.enter="$table?.handleSearch()"
|
||||
/>
|
||||
</QueryBarItem>
|
||||
</template>
|
||||
</CrudTable>
|
||||
|
||||
<!-- Upload Modal -->
|
||||
<n-modal
|
||||
v-model:show="showUploadModal"
|
||||
preset="card"
|
||||
title="上传待清洗数据"
|
||||
style="width: 600px"
|
||||
>
|
||||
<n-form label-placement="left" label-width="100">
|
||||
<n-form-item label="清洗模式">
|
||||
<div class="w-full flex flex-col gap-2">
|
||||
<n-select v-model:value="uploadForm.cleanType" :options="cleanTypeOptions" />
|
||||
<div v-if="currentExample" class="text-xs text-gray-500">
|
||||
<n-a :href="currentExample.url" target="_blank" download class="flex items-center">
|
||||
<TheIcon icon="mdi:download" class="mr-1" />
|
||||
下载{{ currentExample.name }}示例文件
|
||||
</n-a>
|
||||
</div>
|
||||
</div>
|
||||
</n-form-item>
|
||||
<n-form-item label="目标平台">
|
||||
<n-select v-model:value="uploadForm.platform" :options="platformOptions" />
|
||||
</n-form-item>
|
||||
<n-form-item label="代理地址">
|
||||
<div class="w-full flex flex-col gap-2">
|
||||
<n-select
|
||||
v-model:value="selectedProxyId"
|
||||
:options="proxyOptions"
|
||||
placeholder="选择已配置的代理(可选)"
|
||||
clearable
|
||||
@update:value="handleProxySelect"
|
||||
/>
|
||||
<n-input
|
||||
v-model:value="uploadForm.proxy"
|
||||
placeholder="例如 http://user:pass@ip:port,留空则使用默认"
|
||||
/>
|
||||
</div>
|
||||
</n-form-item>
|
||||
<n-form-item label="文件">
|
||||
<n-upload
|
||||
directory-dnd
|
||||
:custom-request="customUploadRequest"
|
||||
:max="1"
|
||||
accept=".txt,.csv,.xlsx"
|
||||
>
|
||||
<n-upload-dragger>
|
||||
<div style="margin-bottom: 12px">
|
||||
<TheIcon icon="mdi:cloud-upload" :size="48" />
|
||||
</div>
|
||||
<n-text style="font-size: 16px">点击或拖动文件上传</n-text>
|
||||
<n-p depth="3" style="margin: 8px 0 0 0">
|
||||
支持 .txt, .csv, .xlsx 文件,每行一个目标<br />
|
||||
请确保文件内容与选择的清洗模式和平台一致
|
||||
</n-p>
|
||||
</n-upload-dragger>
|
||||
</n-upload>
|
||||
</n-form-item>
|
||||
</n-form>
|
||||
</n-modal>
|
||||
|
||||
<!-- Detail Modal -->
|
||||
<n-modal v-model:show="showDetailModal" preset="card" title="结果详情" style="width: 600px">
|
||||
<n-code :code="detailJson" language="json" word-wrap />
|
||||
</n-modal>
|
||||
</div>
|
||||
</CommonPage>
|
||||
</template>
|
||||
@ -137,81 +144,81 @@ const showDetailModal = ref(false)
|
||||
const detailJson = ref('')
|
||||
|
||||
const uploadForm = reactive({
|
||||
cleanType: 'auto',
|
||||
platform: 'auto',
|
||||
proxy: ''
|
||||
cleanType: 'auto',
|
||||
platform: 'auto',
|
||||
proxy: '',
|
||||
})
|
||||
const selectedProxyId = ref(null)
|
||||
const allProxies = ref([])
|
||||
|
||||
/**
 * Select options for the proxy dropdown, derived from `allProxies` and the
 * platform currently chosen in the upload form.
 *
 * Only active proxies are offered. With platform 'auto' every active proxy is
 * listed; otherwise only proxies bound to 'all' or to the chosen platform.
 */
const proxyOptions = computed(() => {
  const proxies = allProxies.value
  if (!proxies || !proxies.length) return []

  const targetPlatform = uploadForm.platform

  const isSelectable = (proxy) => {
    if (!proxy.is_active) return false
    return (
      targetPlatform === 'auto' || proxy.platform === 'all' || proxy.platform === targetPlatform
    )
  }

  // Label shows the proxy name plus its upper-cased protocol type.
  const toOption = (proxy) => ({
    label: `${proxy.name} (${proxy.proxy_type.toUpperCase()})`,
    value: proxy.id,
  })

  return proxies.filter(isSelectable).map(toOption)
})
|
||||
|
||||
/**
 * Fetch the active proxies (first page, up to 100) and store them in `allProxies`.
 *
 * The list is accepted either directly as `res.data` or wrapped one level
 * deeper as `res.data.data`; anything else yields an empty list. This is a
 * best-effort load: on failure the list is reset to empty and the error is
 * logged, never rethrown.
 */
const loadProxies = async () => {
  try {
    const res = await proxyApi.list({ page: 1, page_size: 100, is_active: true })
    const payload = res?.data
    if (Array.isArray(payload)) {
      allProxies.value = payload
    } else if (Array.isArray(payload?.data)) {
      // Envelope form. The previous `res.data || res.data?.data` could never
      // reach this value and stored the envelope object itself.
      allProxies.value = payload.data
    } else {
      allProxies.value = []
    }
  } catch (e) {
    // Keep the upload dialog usable without proxies, but leave a trace of the failure.
    console.error(e)
    allProxies.value = []
  }
}
|
||||
|
||||
/**
 * Handle a change of the proxy dropdown.
 *
 * Always records the selected id. When a proxy is selected and found in
 * `allProxies`, its URL is copied into the upload form's proxy field;
 * clearing the selection leaves the field as it is.
 *
 * @param {*} val - Selected proxy id, or a falsy value when cleared.
 */
const handleProxySelect = (val) => {
  selectedProxyId.value = val
  if (val) {
    const selected = allProxies.value.find((proxy) => proxy.id === val)
    if (selected) uploadForm.proxy = selected.proxy_url
  }
}
|
||||
|
||||
/**
 * Sample file matching the clean type and platform chosen in the upload form.
 *
 * Resolves to `{ name, url }` for the clean types that have a sample
 * (company_name, company_id, clean_url), preferring the platform-specific file
 * when a concrete platform is selected. Resolves to `null` for any other
 * clean type.
 */
const currentExample = computed(() => {
  const cleanType = uploadForm.cleanType
  const targetPlatform = uploadForm.platform

  // Display name and platform-agnostic sample for each supported clean type.
  const genericExamples = new Map([
    ['company_name', { name: '公司名称', url: '/static/examples/company_names.txt' }],
    ['company_id', { name: '公司ID', url: '/static/examples/company_ids.txt' }],
    ['clean_url', { name: 'URL链接', url: '/static/examples/boss_urls.txt' }],
  ])

  // Platform-specific samples; each platform covers all supported clean types.
  const platformExamples = new Map([
    [
      'boss',
      new Map([
        ['company_id', '/static/examples/boss_com_ids.txt'],
        ['company_name', '/static/examples/boss_com_names.txt'],
        ['clean_url', '/static/examples/boss_urls.txt'],
      ]),
    ],
    [
      'qcwy',
      new Map([
        ['company_id', '/static/examples/qcwy_com_ids.txt'],
        ['company_name', '/static/examples/qcwy_com_names.txt'],
        ['clean_url', '/static/examples/qcwy_urls.txt'],
      ]),
    ],
    [
      'zhilian',
      new Map([
        ['company_id', '/static/examples/zhaopin_com_ids.txt'],
        ['company_name', '/static/examples/zhaopin_com_names.txt'],
        ['clean_url', '/static/examples/zhaopin_urls.txt'],
      ]),
    ],
  ])

  const generic = genericExamples.get(cleanType)
  if (!generic) return null

  const overrides = platformExamples.get(targetPlatform)
  const url = overrides ? overrides.get(cleanType) : generic.url
  return { name: generic.name, url }
})
|
||||
|
||||
const cleanTypeOptions = [
|
||||
@ -219,207 +226,228 @@ const cleanTypeOptions = [
|
||||
{ label: '公司名称清洗', value: 'company_name' },
|
||||
{ label: '公司ID清洗', value: 'company_id' },
|
||||
{ label: 'URL清洗', value: 'clean_url' },
|
||||
{ label: '公司Jobs清洗', value: 'company_jobs' }
|
||||
{ label: '公司Jobs清洗', value: 'company_jobs' },
|
||||
]
|
||||
|
||||
const platformOptions = [
|
||||
{ label: '自动识别', value: 'auto' },
|
||||
{ label: 'Boss直聘', value: 'boss' },
|
||||
{ label: '前程无忧', value: 'qcwy' },
|
||||
{ label: '智联招聘', value: 'zhilian' }
|
||||
{ label: '智联招聘', value: 'zhilian' },
|
||||
]
|
||||
|
||||
const statusOptions = [
|
||||
{ label: '待处理', value: 'pending' },
|
||||
{ label: '处理中', value: 'processing' },
|
||||
{ label: '成功', value: 'success' },
|
||||
{ label: '失败', value: 'fail' }
|
||||
{ label: '待处理', value: 'pending' },
|
||||
{ label: '处理中', value: 'processing' },
|
||||
{ label: '成功', value: 'success' },
|
||||
{ label: '失败', value: 'fail' },
|
||||
]
|
||||
|
||||
/**
 * Custom request handler for the upload component: sends the chosen file with
 * the form's clean type, platform and (when set) proxy as multipart form data.
 *
 * On success it shows a success message, closes the upload modal, refreshes
 * the task table and calls `onFinish`; on any failure in that sequence it
 * shows an error message and calls `onError`.
 */
const customUploadRequest = async ({ file, onFinish, onError }) => {
  const payload = new FormData()
  const requiredFields = [
    ['file', file.file],
    ['clean_type', uploadForm.cleanType],
    ['platform', uploadForm.platform],
  ]
  for (const [field, value] of requiredFields) {
    payload.append(field, value)
  }
  // The proxy is optional; omit the field entirely when it is empty.
  if (uploadForm.proxy) {
    payload.append('proxy', uploadForm.proxy)
  }

  try {
    await api.uploadCleaningFile(payload)
    message.success('上传成功')
    showUploadModal.value = false
    $table.value?.handleSearch()
    onFinish()
  } catch (error) {
    message.error('上传失败')
    onError()
  }
}
|
||||
|
||||
const handleClear = async () => {
|
||||
try {
|
||||
await api.clearCleaningTasks()
|
||||
message.success('已清空')
|
||||
$table.value?.handleSearch()
|
||||
} catch (error) {
|
||||
message.error('操作失败')
|
||||
}
|
||||
try {
|
||||
await api.clearCleaningTasks()
|
||||
message.success('已清空')
|
||||
$table.value?.handleSearch()
|
||||
} catch (error) {
|
||||
message.error('操作失败')
|
||||
}
|
||||
}
|
||||
|
||||
const handleProcess = async (row) => {
|
||||
try {
|
||||
row.processing = true // Optimistic UI update if possible, or just loading state
|
||||
await api.processCleaningTask(row.id)
|
||||
message.success('处理完成')
|
||||
$table.value?.handleSearch()
|
||||
} catch (error) {
|
||||
message.error('处理失败')
|
||||
} finally {
|
||||
row.processing = false
|
||||
}
|
||||
try {
|
||||
row.processing = true // Optimistic UI update if possible, or just loading state
|
||||
await api.processCleaningTask(row.id)
|
||||
message.success('处理完成')
|
||||
$table.value?.handleSearch()
|
||||
} catch (error) {
|
||||
message.error('处理失败')
|
||||
} finally {
|
||||
row.processing = false
|
||||
}
|
||||
}
|
||||
|
||||
const handleDelete = async (row) => {
|
||||
try {
|
||||
await api.deleteCleaningTask({ id: row.id })
|
||||
message.success('删除成功')
|
||||
$table.value?.handleSearch()
|
||||
} catch (error) {
|
||||
message.error('删除失败')
|
||||
}
|
||||
try {
|
||||
await api.deleteCleaningTask({ id: row.id })
|
||||
message.success('删除成功')
|
||||
$table.value?.handleSearch()
|
||||
} catch (error) {
|
||||
message.error('删除失败')
|
||||
}
|
||||
}
|
||||
|
||||
const showDetail = (row) => {
|
||||
try {
|
||||
const displayData = {
|
||||
result: row.result_summary,
|
||||
original_data: row.original_data,
|
||||
error: row.error_msg
|
||||
}
|
||||
// Filter out null/undefined values for cleaner display
|
||||
const cleanData = Object.fromEntries(
|
||||
Object.entries(displayData).filter(([_, v]) => v != null)
|
||||
)
|
||||
|
||||
detailJson.value = JSON.stringify(cleanData, null, 2)
|
||||
showDetailModal.value = true
|
||||
} catch (e) {
|
||||
detailJson.value = '{}'
|
||||
try {
|
||||
const displayData = {
|
||||
result: row.result_summary,
|
||||
original_data: row.original_data,
|
||||
error: row.error_msg,
|
||||
}
|
||||
// Filter out null/undefined values for cleaner display
|
||||
const cleanData = Object.fromEntries(Object.entries(displayData).filter(([_, v]) => v != null))
|
||||
|
||||
detailJson.value = JSON.stringify(cleanData, null, 2)
|
||||
showDetailModal.value = true
|
||||
} catch (e) {
|
||||
detailJson.value = '{}'
|
||||
}
|
||||
}
|
||||
|
||||
const columns = [
|
||||
{ title: 'ID', key: 'id', width: 60, align: 'center' },
|
||||
{
|
||||
title: '平台',
|
||||
key: 'platform',
|
||||
width: 100,
|
||||
render(row) {
|
||||
const map = {
|
||||
'boss': 'Boss直聘',
|
||||
'qcwy': '前程无忧',
|
||||
'zhilian': '智联招聘',
|
||||
'auto': '自动识别'
|
||||
}
|
||||
return map[row.platform] || row.platform
|
||||
}
|
||||
{ title: 'ID', key: 'id', width: 60, align: 'center' },
|
||||
{
|
||||
title: '平台',
|
||||
key: 'platform',
|
||||
width: 100,
|
||||
render(row) {
|
||||
const map = {
|
||||
boss: 'Boss直聘',
|
||||
qcwy: '前程无忧',
|
||||
zhilian: '智联招聘',
|
||||
auto: '自动识别',
|
||||
}
|
||||
return map[row.platform] || row.platform
|
||||
},
|
||||
{ title: '目标', key: 'target', width: 200, ellipsis: { tooltip: true } },
|
||||
{ title: '代理', key: 'proxy', width: 220, ellipsis: { tooltip: true } },
|
||||
{
|
||||
title: '清洗模式',
|
||||
key: 'clean_type',
|
||||
width: 100,
|
||||
render(row) {
|
||||
const opt = cleanTypeOptions.find(o => o.value === row.clean_type)
|
||||
return opt ? opt.label : row.clean_type
|
||||
}
|
||||
},
|
||||
{ title: '目标', key: 'target', width: 200, ellipsis: { tooltip: true } },
|
||||
{ title: '代理', key: 'proxy', width: 220, ellipsis: { tooltip: true } },
|
||||
{
|
||||
title: '清洗模式',
|
||||
key: 'clean_type',
|
||||
width: 100,
|
||||
render(row) {
|
||||
const opt = cleanTypeOptions.find((o) => o.value === row.clean_type)
|
||||
return opt ? opt.label : row.clean_type
|
||||
},
|
||||
{
|
||||
title: '状态',
|
||||
key: 'status',
|
||||
width: 100,
|
||||
align: 'center',
|
||||
render(row) {
|
||||
const statusMap = {
|
||||
pending: { type: 'default', text: '待处理' },
|
||||
processing: { type: 'info', text: '处理中' },
|
||||
success: { type: 'success', text: '成功' },
|
||||
fail: { type: 'error', text: '失败' }
|
||||
}
|
||||
const s = statusMap[row.status] || { type: 'default', text: row.status }
|
||||
return h(NTag, { type: s.type, bordered: false }, { default: () => s.text })
|
||||
}
|
||||
},
|
||||
{
|
||||
title: '状态',
|
||||
key: 'status',
|
||||
width: 100,
|
||||
align: 'center',
|
||||
render(row) {
|
||||
const statusMap = {
|
||||
pending: { type: 'default', text: '待处理' },
|
||||
processing: { type: 'info', text: '处理中' },
|
||||
success: { type: 'success', text: '成功' },
|
||||
fail: { type: 'error', text: '失败' },
|
||||
}
|
||||
const s = statusMap[row.status] || { type: 'default', text: row.status }
|
||||
return h(NTag, { type: s.type, bordered: false }, { default: () => s.text })
|
||||
},
|
||||
{
|
||||
title: '存储状态',
|
||||
key: 'storage_status',
|
||||
width: 120,
|
||||
align: 'center',
|
||||
render(row) {
|
||||
const map = {
|
||||
saved: { type: 'success', text: '已入库' },
|
||||
duplicate: { type: 'warning', text: '重复跳过' },
|
||||
failed: { type: 'error', text: '入库失败' },
|
||||
unknown: { type: 'default', text: '-' }
|
||||
}
|
||||
const s = map[row.storage_status] || { type: 'default', text: row.storage_status }
|
||||
return h(NTag, { type: s.type, bordered: false, size: 'small' }, { default: () => s.text })
|
||||
}
|
||||
},
|
||||
{
|
||||
title: '存储状态',
|
||||
key: 'storage_status',
|
||||
width: 120,
|
||||
align: 'center',
|
||||
render(row) {
|
||||
const map = {
|
||||
saved: { type: 'success', text: '已入库' },
|
||||
duplicate: { type: 'warning', text: '重复跳过' },
|
||||
failed: { type: 'error', text: '入库失败' },
|
||||
unknown: { type: 'default', text: '-' },
|
||||
}
|
||||
const s = map[row.storage_status] || { type: 'default', text: row.storage_status }
|
||||
return h(NTag, { type: s.type, bordered: false, size: 'small' }, { default: () => s.text })
|
||||
},
|
||||
{
|
||||
title: '远程推送',
|
||||
key: 'remote_sent',
|
||||
width: 100,
|
||||
align: 'center',
|
||||
render(row) {
|
||||
if (row.status !== 'success') return '-'
|
||||
return row.remote_sent
|
||||
? h(NTag, { type: 'success', size: 'small' }, { default: () => '已发送' })
|
||||
: h(NTag, { type: 'default', size: 'small' }, { default: () => '未发送' })
|
||||
}
|
||||
},
|
||||
{
|
||||
title: '远程推送',
|
||||
key: 'remote_sent',
|
||||
width: 100,
|
||||
align: 'center',
|
||||
render(row) {
|
||||
if (row.status !== 'success') return '-'
|
||||
return row.remote_sent
|
||||
? h(NTag, { type: 'success', size: 'small' }, { default: () => '已发送' })
|
||||
: h(NTag, { type: 'default', size: 'small' }, { default: () => '未发送' })
|
||||
},
|
||||
{
|
||||
title: '操作',
|
||||
key: 'actions',
|
||||
width: 200,
|
||||
align: 'center',
|
||||
fixed: 'right',
|
||||
render(row) {
|
||||
return h(NSpace, { justify: 'center' }, {
|
||||
default: () => [
|
||||
h(NButton, {
|
||||
size: 'small',
|
||||
type: 'primary',
|
||||
ghost: true,
|
||||
disabled: row.status === 'processing',
|
||||
onClick: () => handleProcess(row)
|
||||
}, { default: () => row.status === 'success' ? '重试' : '执行' }),
|
||||
|
||||
(row.status === 'success' || row.status === 'fail') ? h(NButton, {
|
||||
size: 'small',
|
||||
onClick: () => showDetail(row)
|
||||
}, { default: () => '详情' }) : null,
|
||||
},
|
||||
{
|
||||
title: '操作',
|
||||
key: 'actions',
|
||||
width: 200,
|
||||
align: 'center',
|
||||
fixed: 'right',
|
||||
render(row) {
|
||||
return h(
|
||||
NSpace,
|
||||
{ justify: 'center' },
|
||||
{
|
||||
default: () => [
|
||||
h(
|
||||
NButton,
|
||||
{
|
||||
size: 'small',
|
||||
type: 'primary',
|
||||
ghost: true,
|
||||
disabled: row.status === 'processing',
|
||||
onClick: () => handleProcess(row),
|
||||
},
|
||||
{ default: () => (row.status === 'success' ? '重试' : '执行') }
|
||||
),
|
||||
|
||||
h(NPopconfirm, {
|
||||
onPositiveClick: () => handleDelete(row)
|
||||
}, {
|
||||
trigger: () => h(NButton, {
|
||||
size: 'small',
|
||||
type: 'error',
|
||||
text: true,
|
||||
}, { default: () => '删除', icon: renderIcon('mdi:delete', { size: 16 }) }),
|
||||
default: () => '确定删除该任务吗?'
|
||||
})
|
||||
]
|
||||
})
|
||||
row.status === 'success' || row.status === 'fail'
|
||||
? h(
|
||||
NButton,
|
||||
{
|
||||
size: 'small',
|
||||
onClick: () => showDetail(row),
|
||||
},
|
||||
{ default: () => '详情' }
|
||||
)
|
||||
: null,
|
||||
|
||||
h(
|
||||
NPopconfirm,
|
||||
{
|
||||
onPositiveClick: () => handleDelete(row),
|
||||
},
|
||||
{
|
||||
trigger: () =>
|
||||
h(
|
||||
NButton,
|
||||
{
|
||||
size: 'small',
|
||||
type: 'error',
|
||||
text: true,
|
||||
},
|
||||
{ default: () => '删除', icon: renderIcon('mdi:delete', { size: 16 }) }
|
||||
),
|
||||
default: () => '确定删除该任务吗?',
|
||||
}
|
||||
),
|
||||
],
|
||||
}
|
||||
}
|
||||
)
|
||||
},
|
||||
},
|
||||
]
|
||||
|
||||
onMounted(() => {
|
||||
loadProxies()
|
||||
$table.value?.handleSearch()
|
||||
loadProxies()
|
||||
$table.value?.handleSearch()
|
||||
})
|
||||
</script>
|
||||
|
||||
@ -1,4 +1,3 @@
|
||||
|
||||
const Layout = () => import('@/layout/index.vue')
|
||||
|
||||
export default {
|
||||
|
||||
@ -1,6 +1,18 @@
|
||||
<script setup>
|
||||
import { h, ref, watch, onMounted } from 'vue'
|
||||
import { NButton, NInput, NTabs, NTabPane, NPopconfirm, NForm, NFormItem, NGrid, NGridItem, NCard, NStatistic } from 'naive-ui'
|
||||
import {
|
||||
NButton,
|
||||
NInput,
|
||||
NTabs,
|
||||
NTabPane,
|
||||
NPopconfirm,
|
||||
NForm,
|
||||
NFormItem,
|
||||
NGrid,
|
||||
NGridItem,
|
||||
NCard,
|
||||
NStatistic,
|
||||
} from 'naive-ui'
|
||||
import CommonPage from '@/components/page/CommonPage.vue'
|
||||
import QueryBarItem from '@/components/query-bar/QueryBarItem.vue'
|
||||
import CrudModal from '@/components/table/CrudModal.vue'
|
||||
@ -42,22 +54,22 @@ const getList = (params) => {
|
||||
}
|
||||
|
||||
const doCreate = (data) => {
|
||||
return api.create(data, { source: activeTab.value }).then(res => {
|
||||
fetchStats()
|
||||
return res
|
||||
return api.create(data, { source: activeTab.value }).then((res) => {
|
||||
fetchStats()
|
||||
return res
|
||||
})
|
||||
}
|
||||
|
||||
const doUpdate = (data) => {
|
||||
const { id, ...rest } = data
|
||||
return api.update({ id, source: activeTab.value }, rest)
|
||||
const { id, ...rest } = data
|
||||
return api.update({ id, source: activeTab.value }, rest)
|
||||
}
|
||||
|
||||
const doDelete = (data) => {
|
||||
return api.delete({ id: data.id, source: activeTab.value }).then(res => {
|
||||
fetchStats()
|
||||
return res
|
||||
})
|
||||
return api.delete({ id: data.id, source: activeTab.value }).then((res) => {
|
||||
fetchStats()
|
||||
return res
|
||||
})
|
||||
}
|
||||
|
||||
const {
|
||||
@ -88,23 +100,23 @@ const columns = [
|
||||
{ title: 'ID', key: 'id', width: 60, align: 'center' },
|
||||
{ title: '城市', key: 'city', width: 100, align: 'center' },
|
||||
{ title: '职位关键词', key: 'job', width: 150, align: 'center' },
|
||||
{
|
||||
title: '最后请求日期',
|
||||
key: 'last_requested_date',
|
||||
width: 120,
|
||||
align: 'center',
|
||||
render(row) {
|
||||
return row.last_requested_date || '-'
|
||||
}
|
||||
{
|
||||
title: '最后请求日期',
|
||||
key: 'last_requested_date',
|
||||
width: 120,
|
||||
align: 'center',
|
||||
render(row) {
|
||||
return row.last_requested_date || '-'
|
||||
},
|
||||
},
|
||||
{
|
||||
title: '最后请求时间',
|
||||
key: 'last_requested_at',
|
||||
width: 150,
|
||||
align: 'center',
|
||||
render(row) {
|
||||
return row.last_requested_at ? formatDate(row.last_requested_at) : '-'
|
||||
}
|
||||
{
|
||||
title: '最后请求时间',
|
||||
key: 'last_requested_at',
|
||||
width: 150,
|
||||
align: 'center',
|
||||
render(row) {
|
||||
return row.last_requested_at ? formatDate(row.last_requested_at) : '-'
|
||||
},
|
||||
},
|
||||
{
|
||||
title: '操作',
|
||||
@ -134,7 +146,10 @@ const columns = [
|
||||
h(
|
||||
NButton,
|
||||
{ size: 'small', type: 'error' },
|
||||
{ default: () => '删除', icon: renderIcon('material-symbols:delete-outline', { size: 16 }) }
|
||||
{
|
||||
default: () => '删除',
|
||||
icon: renderIcon('material-symbols:delete-outline', { size: 16 }),
|
||||
}
|
||||
),
|
||||
default: () => '确定删除该关键词吗?',
|
||||
}
|
||||
@ -153,11 +168,11 @@ const columns = [
|
||||
<n-card>
|
||||
<n-statistic label="Boss直聘">
|
||||
<template #prefix>
|
||||
<div class="text-sm font-semibold text-gray-700">已用 / 总数</div>
|
||||
<div class="text-sm text-gray-700 font-semibold">已用 / 总数</div>
|
||||
</template>
|
||||
{{ overviewStats.boss.used }} / {{ overviewStats.boss.total }}
|
||||
<template #suffix>
|
||||
<span class="text-xs text-green-600" v-if="overviewStats.boss.total > 0">
|
||||
<span v-if="overviewStats.boss.total > 0" class="text-xs text-green-600">
|
||||
{{ ((overviewStats.boss.used / overviewStats.boss.total) * 100).toFixed(1) }}%
|
||||
</span>
|
||||
</template>
|
||||
@ -168,11 +183,11 @@ const columns = [
|
||||
<n-card>
|
||||
<n-statistic label="前程无忧">
|
||||
<template #prefix>
|
||||
<div class="text-sm font-semibold text-gray-700">已用 / 总数</div>
|
||||
<div class="text-sm text-gray-700 font-semibold">已用 / 总数</div>
|
||||
</template>
|
||||
{{ overviewStats.qcwy.used }} / {{ overviewStats.qcwy.total }}
|
||||
<template #suffix>
|
||||
<span class="text-xs text-green-600" v-if="overviewStats.qcwy.total > 0">
|
||||
<span v-if="overviewStats.qcwy.total > 0" class="text-xs text-green-600">
|
||||
{{ ((overviewStats.qcwy.used / overviewStats.qcwy.total) * 100).toFixed(1) }}%
|
||||
</span>
|
||||
</template>
|
||||
@ -183,12 +198,14 @@ const columns = [
|
||||
<n-card>
|
||||
<n-statistic label="智联招聘">
|
||||
<template #prefix>
|
||||
<div class="text-sm font-semibold text-gray-700">已用 / 总数</div>
|
||||
<div class="text-sm text-gray-700 font-semibold">已用 / 总数</div>
|
||||
</template>
|
||||
{{ overviewStats.zhilian.used }} / {{ overviewStats.zhilian.total }}
|
||||
<template #suffix>
|
||||
<span class="text-xs text-green-600" v-if="overviewStats.zhilian.total > 0">
|
||||
{{ ((overviewStats.zhilian.used / overviewStats.zhilian.total) * 100).toFixed(1) }}%
|
||||
<span v-if="overviewStats.zhilian.total > 0" class="text-xs text-green-600">
|
||||
{{
|
||||
((overviewStats.zhilian.used / overviewStats.zhilian.total) * 100).toFixed(1)
|
||||
}}%
|
||||
</span>
|
||||
</template>
|
||||
</n-statistic>
|
||||
@ -196,8 +213,8 @@ const columns = [
|
||||
</n-grid-item>
|
||||
</n-grid>
|
||||
|
||||
<div class="bg-white p-4 rounded-lg shadow-sm">
|
||||
<div class="flex justify-between items-center mb-4">
|
||||
<div class="rounded-lg bg-white p-4 shadow-sm">
|
||||
<div class="mb-4 flex items-center justify-between">
|
||||
<NTabs v-model:value="activeTab" type="line" animated>
|
||||
<NTabPane name="boss" tab="Boss直聘" />
|
||||
<NTabPane name="qcwy" tab="前程无忧" />
|
||||
@ -251,10 +268,18 @@ const columns = [
|
||||
:label-width="80"
|
||||
:model="modalForm"
|
||||
>
|
||||
<NFormItem label="城市" path="city" :rule="{ required: true, message: '请输入城市', trigger: ['input', 'blur'] }">
|
||||
<NFormItem
|
||||
label="城市"
|
||||
path="city"
|
||||
:rule="{ required: true, message: '请输入城市', trigger: ['input', 'blur'] }"
|
||||
>
|
||||
<NInput v-model:value="modalForm.city" clearable placeholder="请输入城市" />
|
||||
</NFormItem>
|
||||
<NFormItem label="职位" path="job" :rule="{ required: true, message: '请输入职位', trigger: ['input', 'blur'] }">
|
||||
<NFormItem
|
||||
label="职位"
|
||||
path="job"
|
||||
:rule="{ required: true, message: '请输入职位', trigger: ['input', 'blur'] }"
|
||||
>
|
||||
<NInput v-model:value="modalForm.job" clearable placeholder="请输入职位" />
|
||||
</NFormItem>
|
||||
</NForm>
|
||||
|
||||
@ -1,146 +1,14 @@
|
||||
<template>
|
||||
<CommonPage title="Boss直聘数据">
|
||||
<div class="h-full flex flex-col">
|
||||
<CrudTable
|
||||
ref="$table"
|
||||
v-model:query-items="queryItems"
|
||||
:columns="columns"
|
||||
:get-data="getData"
|
||||
>
|
||||
<template #queryBar>
|
||||
<n-form-item label="数据类型" label-placement="left">
|
||||
<n-select
|
||||
v-model:value="queryItems.data_type"
|
||||
:options="dataTypeOptions"
|
||||
style="width: 200px"
|
||||
@update:value="$table?.handleSearch()"
|
||||
/>
|
||||
</n-form-item>
|
||||
</template>
|
||||
</CrudTable>
|
||||
|
||||
<n-modal v-model:show="showDetailModal" preset="card" title="数据详情" style="width: 800px">
|
||||
<n-code :code="detailJson" language="json" word-wrap />
|
||||
</n-modal>
|
||||
</div>
|
||||
</CommonPage>
|
||||
<PlatformData platform="boss" title="Boss直聘数据" :job-columns="jobColumns" />
|
||||
</template>
|
||||
|
||||
<script setup>
|
||||
import { ref, h, onMounted, computed } from 'vue'
|
||||
import { NButton, NTag, NFormItem, NSelect, NCode, NModal } from 'naive-ui'
|
||||
import CommonPage from '@/components/page/CommonPage.vue'
|
||||
import CrudTable from '@/components/table/CrudTable.vue'
|
||||
import api from '@/api'
|
||||
import PlatformData from '../components/PlatformData.vue'
|
||||
|
||||
const $table = ref(null)
|
||||
const queryItems = ref({
|
||||
data_type: 'job',
|
||||
platform: 'boss'
|
||||
})
|
||||
const showDetailModal = ref(false)
|
||||
const detailJson = ref('')
|
||||
|
||||
const dataTypeOptions = [
|
||||
{ label: '职位', value: 'job' },
|
||||
{ label: '公司', value: 'company' }
|
||||
const jobColumns = [
|
||||
{ title: 'ID', key: 'id', width: 100 },
|
||||
{ title: '职位ID', key: 'job_id', width: 200 },
|
||||
{ title: '创建时间', key: 'created_at', width: 200 },
|
||||
{ title: '更新时间', key: 'updated_at', width: 200 },
|
||||
]
|
||||
|
||||
const columns = computed(() => {
|
||||
if (queryItems.value.data_type === 'job') {
|
||||
return [
|
||||
{
|
||||
title: 'ID',
|
||||
key: 'id',
|
||||
width: 100
|
||||
},
|
||||
{
|
||||
title: '职位ID',
|
||||
key: 'job_id',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '创建时间',
|
||||
key: 'created_at',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '更新时间',
|
||||
key: 'updated_at',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '操作',
|
||||
key: 'actions',
|
||||
width: 100,
|
||||
align: 'center',
|
||||
fixed: 'right',
|
||||
render(row) {
|
||||
return h(NButton, {
|
||||
size: 'small',
|
||||
onClick: () => {
|
||||
// 优先展示原始 json_data,如果没有则展示整个 row
|
||||
const content = row.json_data ? JSON.parse(row.json_data) : row
|
||||
detailJson.value = JSON.stringify(content, null, 2)
|
||||
showDetailModal.value = true
|
||||
}
|
||||
}, { default: () => '详情' })
|
||||
}
|
||||
}
|
||||
]
|
||||
} else {
|
||||
return [
|
||||
{
|
||||
title: 'ID',
|
||||
key: 'id',
|
||||
width: 100
|
||||
},
|
||||
{
|
||||
title: '公司名称',
|
||||
key: 'company_name',
|
||||
width: 250
|
||||
},
|
||||
{
|
||||
title: '创建时间',
|
||||
key: 'created_at',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '更新时间',
|
||||
key: 'updated_at',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '操作',
|
||||
key: 'actions',
|
||||
width: 100,
|
||||
align: 'center',
|
||||
fixed: 'right',
|
||||
render(row) {
|
||||
return h(NButton, {
|
||||
size: 'small',
|
||||
onClick: () => {
|
||||
const content = row.json_data ? JSON.parse(row.json_data) : row
|
||||
detailJson.value = JSON.stringify(content, null, 2)
|
||||
showDetailModal.value = true
|
||||
}
|
||||
}, { default: () => '详情' })
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
})
|
||||
|
||||
const getData = async (params) => {
|
||||
params.platform = 'boss'
|
||||
const res = await api.queryPlatformData(params)
|
||||
return {
|
||||
data: res.data?.items || [],
|
||||
total: res.data?.total || 0
|
||||
}
|
||||
}
|
||||
|
||||
onMounted(() => {
|
||||
$table.value?.handleSearch()
|
||||
})
|
||||
</script>
|
||||
|
||||
112
web/src/views/recruitment/components/PlatformData.vue
Normal file
112
web/src/views/recruitment/components/PlatformData.vue
Normal file
@ -0,0 +1,112 @@
|
||||
<template>
|
||||
<CommonPage :title="title">
|
||||
<div class="h-full flex flex-col">
|
||||
<CrudTable
|
||||
ref="$table"
|
||||
v-model:query-items="queryItems"
|
||||
:columns="currentColumns"
|
||||
:get-data="getData"
|
||||
>
|
||||
<template #queryBar>
|
||||
<n-form-item label="数据类型" label-placement="left">
|
||||
<n-select
|
||||
v-model:value="queryItems.data_type"
|
||||
:options="dataTypeOptions"
|
||||
style="width: 200px"
|
||||
@update:value="() => $table?.handleSearch()"
|
||||
/>
|
||||
</n-form-item>
|
||||
</template>
|
||||
</CrudTable>
|
||||
|
||||
<n-modal v-model:show="showDetailModal" preset="card" title="数据详情" style="width: 800px">
|
||||
<n-code :code="detailJson" language="json" word-wrap />
|
||||
</n-modal>
|
||||
</div>
|
||||
</CommonPage>
|
||||
</template>
|
||||
|
||||
<script setup>
|
||||
import { ref, h, onMounted, computed, defineProps } from 'vue'
|
||||
import { NButton, NFormItem, NSelect, NCode, NModal } from 'naive-ui'
|
||||
import CommonPage from '@/components/page/CommonPage.vue'
|
||||
import CrudTable from '@/components/table/CrudTable.vue'
|
||||
import api from '@/api'
|
||||
|
||||
const props = defineProps({
|
||||
platform: {
|
||||
type: String,
|
||||
required: true,
|
||||
},
|
||||
title: {
|
||||
type: String,
|
||||
required: true,
|
||||
},
|
||||
jobColumns: {
|
||||
type: Array,
|
||||
required: true,
|
||||
},
|
||||
})
|
||||
|
||||
const $table = ref(null)
|
||||
const queryItems = ref({
|
||||
data_type: 'job',
|
||||
platform: props.platform,
|
||||
})
|
||||
const showDetailModal = ref(false)
|
||||
const detailJson = ref('')
|
||||
|
||||
const dataTypeOptions = [
|
||||
{ label: '职位', value: 'job' },
|
||||
{ label: '公司', value: 'company' },
|
||||
]
|
||||
|
||||
/**
 * Build the trailing actions column definition appended to both the job and
 * company column sets.
 *
 * The column renders a small button per row. Clicking it fills `detailJson`
 * with pretty-printed JSON and sets `showDetailModal` to true.
 *
 * @returns {object} a column definition with a `render(row)` function
 */
const renderActionColumn = () => ({
  title: '操作',
  key: 'actions',
  width: 100,
  align: 'center',
  fixed: 'right',
  render(row) {
    return h(
      NButton,
      {
        size: 'small',
        onClick: () => {
          // Prefer the row's json_data payload; fall back to the whole row when
          // it is absent or cannot be parsed, so the modal always opens instead
          // of the click handler throwing.
          let content = row
          if (row.json_data) {
            try {
              // json_data may already be an object; only parse when it is a string.
              content =
                typeof row.json_data === 'string' ? JSON.parse(row.json_data) : row.json_data
            } catch (e) {
              content = row
            }
          }
          detailJson.value = JSON.stringify(content, null, 2)
          showDetailModal.value = true
        },
      },
      { default: () => '详情' }
    )
  },
})
|
||||
|
||||
// Column set for the currently selected data type: the caller-supplied job
// columns when data_type is 'job', otherwise the fixed company columns. The
// actions column is always appended last.
const currentColumns = computed(() => {
  const showingJobs = queryItems.value.data_type === 'job'
  const baseColumns = showingJobs
    ? props.jobColumns
    : [
        { title: 'ID', key: 'id', width: 100 },
        { title: '公司名称', key: 'company_name', width: 250 },
        { title: '创建时间', key: 'created_at', width: 200 },
        { title: '更新时间', key: 'updated_at', width: 200 },
      ]
  return [...baseColumns, renderActionColumn()]
})
|
||||
|
||||
/**
 * Data loader passed to the table via `:get-data`.
 *
 * Sets `platform` on the incoming params object (in place) from the component
 * prop, queries the platform-data endpoint, and returns `{ data, total }`,
 * defaulting to an empty list and 0 when the response carries no payload.
 */
const getData = async (params) => {
  Object.assign(params, { platform: props.platform })
  const payload = (await api.queryPlatformData(params)).data
  const data = payload?.items || []
  const total = payload?.total || 0
  return { data, total }
}
|
||||
|
||||
onMounted(() => {
|
||||
$table.value?.handleSearch()
|
||||
})
|
||||
</script>
|
||||
@ -1,151 +1,15 @@
|
||||
<template>
|
||||
<CommonPage title="前程无忧数据">
|
||||
<div class="h-full flex flex-col">
|
||||
<CrudTable
|
||||
ref="$table"
|
||||
v-model:query-items="queryItems"
|
||||
:columns="columns"
|
||||
:get-data="getData"
|
||||
>
|
||||
<template #queryBar>
|
||||
<n-form-item label="数据类型" label-placement="left">
|
||||
<n-select
|
||||
v-model:value="queryItems.data_type"
|
||||
:options="dataTypeOptions"
|
||||
style="width: 200px"
|
||||
@update:value="$table?.handleSearch()"
|
||||
/>
|
||||
</n-form-item>
|
||||
</template>
|
||||
</CrudTable>
|
||||
|
||||
<n-modal v-model:show="showDetailModal" preset="card" title="数据详情" style="width: 800px">
|
||||
<n-code :code="detailJson" language="json" word-wrap />
|
||||
</n-modal>
|
||||
</div>
|
||||
</CommonPage>
|
||||
<PlatformData platform="qcwy" title="前程无忧数据" :job-columns="jobColumns" />
|
||||
</template>
|
||||
|
||||
<script setup>
|
||||
import { ref, h, onMounted, computed } from 'vue'
|
||||
import { NButton, NTag, NFormItem, NSelect, NCode, NModal } from 'naive-ui'
|
||||
import CommonPage from '@/components/page/CommonPage.vue'
|
||||
import CrudTable from '@/components/table/CrudTable.vue'
|
||||
import api from '@/api'
|
||||
import PlatformData from '../components/PlatformData.vue'
|
||||
|
||||
const $table = ref(null)
|
||||
const queryItems = ref({
|
||||
data_type: 'job',
|
||||
platform: 'qcwy'
|
||||
})
|
||||
const showDetailModal = ref(false)
|
||||
const detailJson = ref('')
|
||||
|
||||
const dataTypeOptions = [
|
||||
{ label: '职位', value: 'job' },
|
||||
{ label: '公司', value: 'company' }
|
||||
const jobColumns = [
|
||||
{ title: 'ID', key: 'id', width: 100 },
|
||||
{ title: '职位ID', key: 'job_id', width: 200 },
|
||||
{ title: '更新时间', key: 'update_date_time', width: 200 },
|
||||
{ title: '创建时间', key: 'created_at', width: 200 },
|
||||
{ title: '最后更新', key: 'updated_at', width: 200 },
|
||||
]
|
||||
|
||||
const columns = computed(() => {
|
||||
if (queryItems.value.data_type === 'job') {
|
||||
return [
|
||||
{
|
||||
title: 'ID',
|
||||
key: 'id',
|
||||
width: 100
|
||||
},
|
||||
{
|
||||
title: '职位ID',
|
||||
key: 'job_id',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '更新时间',
|
||||
key: 'update_date_time',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '创建时间',
|
||||
key: 'created_at',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '最后更新',
|
||||
key: 'updated_at',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '操作',
|
||||
key: 'actions',
|
||||
width: 100,
|
||||
align: 'center',
|
||||
fixed: 'right',
|
||||
render(row) {
|
||||
return h(NButton, {
|
||||
size: 'small',
|
||||
onClick: () => {
|
||||
const content = row.json_data ? JSON.parse(row.json_data) : row
|
||||
detailJson.value = JSON.stringify(content, null, 2)
|
||||
showDetailModal.value = true
|
||||
}
|
||||
}, { default: () => '详情' })
|
||||
}
|
||||
}
|
||||
]
|
||||
} else {
|
||||
return [
|
||||
{
|
||||
title: 'ID',
|
||||
key: 'id',
|
||||
width: 100
|
||||
},
|
||||
{
|
||||
title: '公司名称',
|
||||
key: 'company_name',
|
||||
width: 250
|
||||
},
|
||||
{
|
||||
title: '创建时间',
|
||||
key: 'created_at',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '更新时间',
|
||||
key: 'updated_at',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '操作',
|
||||
key: 'actions',
|
||||
width: 100,
|
||||
align: 'center',
|
||||
fixed: 'right',
|
||||
render(row) {
|
||||
return h(NButton, {
|
||||
size: 'small',
|
||||
onClick: () => {
|
||||
const content = row.json_data ? JSON.parse(row.json_data) : row
|
||||
detailJson.value = JSON.stringify(content, null, 2)
|
||||
showDetailModal.value = true
|
||||
}
|
||||
}, { default: () => '详情' })
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
})
|
||||
|
||||
const getData = async (params) => {
|
||||
// Ensure platform is set
|
||||
params.platform = 'qcwy'
|
||||
const res = await api.queryPlatformData(params)
|
||||
return {
|
||||
data: res.data?.items || [],
|
||||
total: res.data?.total || 0
|
||||
}
|
||||
}
|
||||
|
||||
onMounted(() => {
|
||||
$table.value?.handleSearch()
|
||||
})
|
||||
</script>
|
||||
|
||||
@ -1,150 +1,15 @@
|
||||
<template>
|
||||
<CommonPage title="智联招聘数据">
|
||||
<div class="h-full flex flex-col">
|
||||
<CrudTable
|
||||
ref="$table"
|
||||
v-model:query-items="queryItems"
|
||||
:columns="columns"
|
||||
:get-data="getData"
|
||||
>
|
||||
<template #queryBar>
|
||||
<n-form-item label="数据类型" label-placement="left">
|
||||
<n-select
|
||||
v-model:value="queryItems.data_type"
|
||||
:options="dataTypeOptions"
|
||||
style="width: 200px"
|
||||
@update:value="$table?.handleSearch()"
|
||||
/>
|
||||
</n-form-item>
|
||||
</template>
|
||||
</CrudTable>
|
||||
|
||||
<n-modal v-model:show="showDetailModal" preset="card" title="数据详情" style="width: 800px">
|
||||
<n-code :code="detailJson" language="json" word-wrap />
|
||||
</n-modal>
|
||||
</div>
|
||||
</CommonPage>
|
||||
<PlatformData platform="zhilian" title="智联招聘数据" :job-columns="jobColumns" />
|
||||
</template>
|
||||
|
||||
<script setup>
|
||||
import { ref, h, onMounted, computed } from 'vue'
|
||||
import { NButton, NTag, NFormItem, NSelect, NCode, NModal } from 'naive-ui'
|
||||
import CommonPage from '@/components/page/CommonPage.vue'
|
||||
import CrudTable from '@/components/table/CrudTable.vue'
|
||||
import api from '@/api'
|
||||
import PlatformData from '../components/PlatformData.vue'
|
||||
|
||||
const $table = ref(null)
|
||||
const queryItems = ref({
|
||||
data_type: 'job',
|
||||
platform: 'zhilian'
|
||||
})
|
||||
const showDetailModal = ref(false)
|
||||
const detailJson = ref('')
|
||||
|
||||
const dataTypeOptions = [
|
||||
{ label: '职位', value: 'job' },
|
||||
{ label: '公司', value: 'company' }
|
||||
const jobColumns = [
|
||||
{ title: 'ID', key: 'id', width: 100 },
|
||||
{ title: '职位编号', key: 'number', width: 200 },
|
||||
{ title: '发布时间', key: 'first_publish_time', width: 200 },
|
||||
{ title: '创建时间', key: 'created_at', width: 200 },
|
||||
{ title: '更新时间', key: 'updated_at', width: 200 },
|
||||
]
|
||||
|
||||
const columns = computed(() => {
|
||||
if (queryItems.value.data_type === 'job') {
|
||||
return [
|
||||
{
|
||||
title: 'ID',
|
||||
key: 'id',
|
||||
width: 100
|
||||
},
|
||||
{
|
||||
title: '职位编号',
|
||||
key: 'number',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '发布时间',
|
||||
key: 'first_publish_time',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '创建时间',
|
||||
key: 'created_at',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '更新时间',
|
||||
key: 'updated_at',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '操作',
|
||||
key: 'actions',
|
||||
width: 100,
|
||||
align: 'center',
|
||||
fixed: 'right',
|
||||
render(row) {
|
||||
return h(NButton, {
|
||||
size: 'small',
|
||||
onClick: () => {
|
||||
const content = row.json_data ? JSON.parse(row.json_data) : row
|
||||
detailJson.value = JSON.stringify(content, null, 2)
|
||||
showDetailModal.value = true
|
||||
}
|
||||
}, { default: () => '详情' })
|
||||
}
|
||||
}
|
||||
]
|
||||
} else {
|
||||
return [
|
||||
{
|
||||
title: 'ID',
|
||||
key: 'id',
|
||||
width: 100
|
||||
},
|
||||
{
|
||||
title: '公司名称',
|
||||
key: 'company_name',
|
||||
width: 250
|
||||
},
|
||||
{
|
||||
title: '创建时间',
|
||||
key: 'created_at',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '更新时间',
|
||||
key: 'updated_at',
|
||||
width: 200
|
||||
},
|
||||
{
|
||||
title: '操作',
|
||||
key: 'actions',
|
||||
width: 100,
|
||||
align: 'center',
|
||||
fixed: 'right',
|
||||
render(row) {
|
||||
return h(NButton, {
|
||||
size: 'small',
|
||||
onClick: () => {
|
||||
const content = row.json_data ? JSON.parse(row.json_data) : row
|
||||
detailJson.value = JSON.stringify(content, null, 2)
|
||||
showDetailModal.value = true
|
||||
}
|
||||
}, { default: () => '详情' })
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
})
|
||||
|
||||
const getData = async (params) => {
|
||||
params.platform = 'zhilian'
|
||||
const res = await api.queryPlatformData(params)
|
||||
return {
|
||||
data: res.data?.items || [],
|
||||
total: res.data?.total || 0
|
||||
}
|
||||
}
|
||||
|
||||
onMounted(() => {
|
||||
$table.value?.handleSearch()
|
||||
})
|
||||
</script>
|
||||
|
||||
@ -81,7 +81,7 @@ const methodOptions = [
|
||||
|
||||
function formatJSON(data) {
|
||||
try {
|
||||
return typeof data === 'string'
|
||||
return typeof data === 'string'
|
||||
? JSON.stringify(JSON.parse(data), null, 2)
|
||||
: JSON.stringify(data, null, 2)
|
||||
} catch (e) {
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
ref="tableRef"
|
||||
:columns="columns"
|
||||
:query-items="queryItems"
|
||||
:getData="handleGetList"
|
||||
:get-data="handleGetList"
|
||||
>
|
||||
<template #queryBar>
|
||||
<QueryBarItem label="wt2">
|
||||
@ -24,7 +24,12 @@
|
||||
</footer>
|
||||
</n-card>
|
||||
|
||||
<CrudModal :title="modalTitle" :visible="modalVisible" :loading="modalLoading" @save="() => handleSave(tableRef?.handleSearch)">
|
||||
<CrudModal
|
||||
:title="modalTitle"
|
||||
:visible="modalVisible"
|
||||
:loading="modalLoading"
|
||||
@save="() => handleSave(tableRef?.handleSearch)"
|
||||
>
|
||||
<n-form ref="modalFormRef" :model="modalForm" label-placement="left" label-width="90">
|
||||
<n-form-item label="wt2" path="wt2" :rule="{ required: true, message: '请输入 wt2' }">
|
||||
<n-input v-model:value="modalForm.wt2" />
|
||||
@ -41,7 +46,6 @@
|
||||
</n-form>
|
||||
</CrudModal>
|
||||
</AppPage>
|
||||
|
||||
</template>
|
||||
|
||||
<script setup>
|
||||
@ -57,7 +61,12 @@ const columns = [
|
||||
{ title: 'ID', key: 'id', width: 80 },
|
||||
{ title: 'wt2', key: 'wt2', minWidth: 160 },
|
||||
{ title: 'mpt', key: 'mpt', minWidth: 160 },
|
||||
{ title: '启用', key: 'is_active', width: 90, render: (row) => h('span', row.is_active ? '是' : '否') },
|
||||
{
|
||||
title: '启用',
|
||||
key: 'is_active',
|
||||
width: 90,
|
||||
render: (row) => h('span', row.is_active ? '是' : '否'),
|
||||
},
|
||||
{ title: '失败次数', key: 'failed_count', width: 100 },
|
||||
{ title: '最后使用时间', key: 'last_used_time', minWidth: 180 },
|
||||
{ title: '创建时间', key: 'created_at', minWidth: 180 },
|
||||
@ -67,8 +76,16 @@ const columns = [
|
||||
width: 160,
|
||||
render(row) {
|
||||
return h('div', { style: 'display:flex;gap:8px' }, [
|
||||
h(NButton, { type: 'primary', size: 'small', onClick: () => handleEdit(row) }, { default: () => '编辑' }),
|
||||
h(NButton, { size: 'small', onClick: () => handleDelete(row.id) }, { default: () => '删除' }),
|
||||
h(
|
||||
NButton,
|
||||
{ type: 'primary', size: 'small', onClick: () => handleEdit(row) },
|
||||
{ default: () => '编辑' }
|
||||
),
|
||||
h(
|
||||
NButton,
|
||||
{ size: 'small', onClick: () => handleDelete(row.id) },
|
||||
{ default: () => '删除' }
|
||||
),
|
||||
])
|
||||
},
|
||||
},
|
||||
@ -80,17 +97,26 @@ async function handleGetList(params) {
|
||||
}
|
||||
|
||||
const initForm = { id: null, wt2: '', mpt: '', is_active: true, failed_count: 0 }
|
||||
const { modalVisible, modalTitle, modalLoading, handleAdd, handleDelete, handleEdit, handleSave, modalForm, modalFormRef } = useCRUD({
|
||||
const {
|
||||
modalVisible,
|
||||
modalTitle,
|
||||
modalLoading,
|
||||
handleAdd,
|
||||
handleDelete,
|
||||
handleEdit,
|
||||
handleSave,
|
||||
modalForm,
|
||||
modalFormRef,
|
||||
} = useCRUD({
|
||||
name: 'Boss Token',
|
||||
initForm,
|
||||
doCreate: (data) => tokenApi.create(data),
|
||||
doUpdate: (data) => tokenApi.update(data.id, data),
|
||||
doDelete: (id) => tokenApi.remove(id),
|
||||
refresh: () => tableRef?.handleSearch(),
|
||||
refresh: () => tableRef.value?.handleSearch(),
|
||||
})
|
||||
|
||||
onMounted(() => {
|
||||
tableRef.value?.handleSearch()
|
||||
})
|
||||
|
||||
</script>
|
||||
|
||||
@ -204,40 +204,41 @@ const columns = [
|
||||
default: () => h('div', {}, '确定删除该用户吗?'),
|
||||
}
|
||||
),
|
||||
!row.is_superuser && h(
|
||||
NPopconfirm,
|
||||
{
|
||||
onPositiveClick: async () => {
|
||||
try {
|
||||
await api.resetPassword({ user_id: row.id });
|
||||
$message.success('密码已成功重置为123456');
|
||||
await $table.value?.handleSearch();
|
||||
} catch (error) {
|
||||
$message.error('重置密码失败: ' + error.message);
|
||||
}
|
||||
!row.is_superuser &&
|
||||
h(
|
||||
NPopconfirm,
|
||||
{
|
||||
onPositiveClick: async () => {
|
||||
try {
|
||||
await api.resetPassword({ user_id: row.id })
|
||||
$message.success('密码已成功重置为123456')
|
||||
await $table.value?.handleSearch()
|
||||
} catch (error) {
|
||||
$message.error('重置密码失败: ' + error.message)
|
||||
}
|
||||
},
|
||||
onNegativeClick: () => {},
|
||||
},
|
||||
onNegativeClick: () => {},
|
||||
},
|
||||
{
|
||||
trigger: () =>
|
||||
withDirectives(
|
||||
h(
|
||||
NButton,
|
||||
{
|
||||
size: 'small',
|
||||
type: 'warning',
|
||||
style: 'margin-right: 8px;',
|
||||
},
|
||||
{
|
||||
default: () => '重置密码',
|
||||
icon: renderIcon('material-symbols:lock-reset', { size: 16 }),
|
||||
}
|
||||
{
|
||||
trigger: () =>
|
||||
withDirectives(
|
||||
h(
|
||||
NButton,
|
||||
{
|
||||
size: 'small',
|
||||
type: 'warning',
|
||||
style: 'margin-right: 8px;',
|
||||
},
|
||||
{
|
||||
default: () => '重置密码',
|
||||
icon: renderIcon('material-symbols:lock-reset', { size: 16 }),
|
||||
}
|
||||
),
|
||||
[[vPermission, 'post/api/v1/user/reset_password']]
|
||||
),
|
||||
[[vPermission, 'post/api/v1/user/reset_password']]
|
||||
),
|
||||
default: () => h('div', {}, '确定重置用户密码为123456吗?'),
|
||||
}
|
||||
),
|
||||
default: () => h('div', {}, '确定重置用户密码为123456吗?'),
|
||||
}
|
||||
),
|
||||
]
|
||||
},
|
||||
},
|
||||
@ -363,11 +364,11 @@ const validateAddUser = {
|
||||
<h1>部门列表</h1>
|
||||
<br />
|
||||
<NTree
|
||||
block-line
|
||||
:data="deptOption"
|
||||
key-field="id"
|
||||
label-field="name"
|
||||
default-expand-all
|
||||
block-line
|
||||
:node-props="nodeProps"
|
||||
>
|
||||
</NTree>
|
||||
|
||||
@ -30,7 +30,7 @@ export default defineConfig(({ command, mode }) => {
|
||||
open: true,
|
||||
proxy: VITE_USE_PROXY
|
||||
? {
|
||||
[VITE_BASE_API]: PROXY_CONFIG[VITE_BASE_API]
|
||||
[VITE_BASE_API]: PROXY_CONFIG[VITE_BASE_API],
|
||||
}
|
||||
: undefined,
|
||||
},
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user