feat: 优化公司数据去重逻辑,扩大检查范围到90天

This commit is contained in:
邹方成 2026-01-14 22:14:33 +08:00
commit 59bfefff0e
304 changed files with 215539 additions and 0 deletions

4
.dockerignore Normal file
View File

@ -0,0 +1,4 @@
web/node_modules
video
local_deploy
clickhouse_data

1
.gitattributes vendored Normal file
View File

@ -0,0 +1 @@
*.html linguist-language=python

21
.gitignore vendored Normal file
View File

@ -0,0 +1,21 @@
__pycache__/
.idea/
venv/
.venv/
.mypy_cache/
.vscode
.ruff_cache/
.pytest_cache/
migrations/
db.sqlite3
db.sqlite3-journal
db.sqlite3-shm
db.sqlite3-wal
.DS_Store
._.DS_Store
clickhouse_data
data
videos
videps

39
Dockerfile Normal file
View File

@ -0,0 +1,39 @@
# ---- Stage 1: build the front-end bundle ----
FROM node:18-alpine AS web
WORKDIR /opt/vue-fastapi-admin
COPY /web ./web
# Install pnpm, point it at the npmmirror registry, then install and build.
RUN npm install -g pnpm && \
    cd /opt/vue-fastapi-admin/web && \
    pnpm config set registry https://registry.npmmirror.com && \
    pnpm install && \
    pnpm run build

# ---- Stage 2: Python runtime image that also ships nginx ----
FROM python:3.11-slim-bullseye
WORKDIR /opt/vue-fastapi-admin
# COPY (not ADD) for plain local files: ADD's archive/URL handling is not needed here.
COPY . .
COPY /deploy/entrypoint.sh .
# Each cache mount gets its own id. With a shared id both targets would be
# backed by the same cache directory, mixing /var/cache/apt and /var/lib/apt.
# docker-clean is removed so downloaded packages stay in the mounted cache.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=core-apt-cache \
    --mount=type=cache,target=/var/lib/apt,sharing=locked,id=core-apt-lib \
    sed -i "s@http://.*.debian.org@http://mirrors.ustc.edu.cn@g" /etc/apt/sources.list \
    && rm -f /etc/apt/apt.conf.d/docker-clean \
    && ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && echo "Asia/Shanghai" > /etc/timezone \
    && apt-get update \
    && apt-get install -y --no-install-recommends gcc python3-dev bash nginx vim curl procps net-tools
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
# Bring in the front-end build produced by the first stage.
COPY --from=web /opt/vue-fastapi-admin/web/dist /opt/vue-fastapi-admin/web/dist
COPY /deploy/web.conf /etc/nginx/sites-available/web.conf
# Replace nginx's default site with the project's site definition.
RUN rm -f /etc/nginx/sites-enabled/default \
    && ln -s /etc/nginx/sites-available/web.conf /etc/nginx/sites-enabled/
# NOTE(review): no locale package is installed above; confirm zh_CN.UTF-8 is
# actually available in this base image.
ENV LANG=zh_CN.UTF-8
EXPOSE 80
ENTRYPOINT [ "sh", "entrypoint.sh" ]

21
LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 mizhexiaoxiao
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

87
Makefile Normal file
View File

@ -0,0 +1,87 @@
# Build configuration
# -------------------
# The back-quoted commands are stored as literal text. Make does not run them;
# the shell does, each time a recipe line that references the variable executes.
# APP_NAME / APP_VERSION: the quoted value on pyproject.toml lines that start
# with `name` / `version` (optionally preceded by spaces).
APP_NAME := `sed -n 's/^ *name.*=.*"\([^"]*\)".*/\1/p' pyproject.toml`
APP_VERSION := `sed -n 's/^ *version.*=.*"\([^"]*\)".*/\1/p' pyproject.toml`
# GIT_REVISION: output of `git rev-parse HEAD`.
GIT_REVISION = `git rev-parse HEAD`
# Introspection targets
# ---------------------
# `make help` prints the environment header followed by the documented targets.
.PHONY: help
help: header targets

# Print APP_NAME, APP_VERSION and GIT_REVISION in colour.
# printf is used throughout: whether `echo` interprets backslash escapes such
# as \033 and \n depends on the shell implementation.
.PHONY: header
header:
	@printf '\033[34m%s\033[0m\n' "Environment"
	@printf '\033[34m%s\033[0m\n' "---------------------------------------------------------------"
	@printf "\033[33m%-23s\033[0m" "APP_NAME"
	@printf "\033[35m%s\033[0m" $(APP_NAME)
	@printf '\n'
	@printf "\033[33m%-23s\033[0m" "APP_VERSION"
	@printf "\033[35m%s\033[0m" $(APP_VERSION)
	@printf '\n'
	@printf "\033[33m%-23s\033[0m" "GIT_REVISION"
	@printf "\033[35m%s\033[0m" $(GIT_REVISION)
	@printf '\n\n'
# List every target that carries a `## description` comment, sorted by name.
.PHONY: targets
targets:
	@printf '\033[34m%s\033[0m\n' "Development Targets"
	@printf '\033[34m%s\033[0m\n' "---------------------------------------------------------------"
	@# `$$&` reaches perl as `$&` (the matched text). A single `$&` would be
	@# expanded by make to an empty string before perl ever sees it.
	@perl -nle'print $$& if m{^[a-zA-Z_-]+:.*?## .*$$}' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-22s\033[0m %s\n", $$1, $$2}'
# Development targets
# -------------
# Install the dependencies declared in pyproject.toml.
# `uv add pyproject.toml` would try to add a dependency named "pyproject.toml";
# `uv sync` is the command that installs what the project file declares.
.PHONY: install
install: ## Install dependencies
	uv sync
# `run` is an alias for `start`; both launch the server through run.py.
.PHONY: run start
run: start

start: ## Starts the server
	python run.py
# Check, lint and format targets
# ------------------------------
.PHONY: check check-format lint format

# Aggregate target: formatter dry-run plus linter.
check: check-format lint

check-format: ## Dry-run code formatter
	black ./ --check
	isort ./ --profile black --check

lint: ## Run ruff
	ruff check ./app

format: ## Run code formatter
	black ./
	isort ./ --profile black
# Load .env into make, export the variables it defines to the recipe
# environment, then run pytest.
# The original `$(sh ...)` is not a make function: it expanded to nothing, so a
# bare `export` exported every make variable. `$(shell ...)` lists the names
# defined in .env; `sed -n .../p` skips comments and blank lines.
.PHONY: test
test: ## Run the test suite
	$(eval include .env)
	$(eval export $(shell sed -n 's/^\([A-Za-z_][A-Za-z0-9_]*\)=.*/\1/p' .env))
	pytest -vv -s --cache-clear ./
# Remove generated migration folders and the SQLite database files.
# Virtualenv and node_modules trees are pruned so that `migrations` packages
# shipped inside installed dependencies are never deleted.
.PHONY: clean-db
clean-db: ## 删除migrations文件夹和db.sqlite3
	find . \( -path ./.venv -o -path ./venv -o -path '*/node_modules' \) -prune -o -type d -name "migrations" -exec rm -rf {} +
	rm -f db.sqlite3 db.sqlite3-shm db.sqlite3-wal
# Thin wrappers around the aerich CLI.
.PHONY: migrate upgrade

migrate: ## 运行aerich migrate命令生成迁移文件
	aerich migrate

upgrade: ## 运行aerich upgrade命令应用迁移
	aerich upgrade

74
Pipfile Normal file
View File

@ -0,0 +1,74 @@
[[source]]
url = "https://pypi.doubanio.com/simple"
verify_ssl = true
name = "pip_conf_index_global"
[packages]
aerich = "==0.8.1"
aiosqlite = "==0.20.0"
annotated-types = "==0.7.0"
anyio = "==4.8.0"
argon2-cffi = "==23.1.0"
argon2-cffi-bindings = "==21.2.0"
asyncclick = "==8.1.8"
black = "==24.10.0"
certifi = "==2024.12.14"
cffi = "==1.17.1"
click = "==8.1.8"
dictdiffer = "==0.9.0"
dnspython = "==2.7.0"
email-validator = "==2.2.0"
fastapi = "==0.111.0"
fastapi-cli = "==0.0.7"
h11 = "==0.14.0"
httpcore = "==1.0.7"
httptools = "==0.6.4"
httpx = "==0.28.1"
idna = "==3.10"
iso8601 = "==2.1.0"
isort = "==5.13.2"
jinja2 = "==3.1.5"
loguru = "==0.7.3"
markdown-it-py = "==3.0.0"
markupsafe = "==3.0.2"
mdurl = "==0.1.2"
mypy-extensions = "==1.0.0"
orjson = "==3.10.14"
packaging = "==24.2"
passlib = "==1.7.4"
pathspec = "==0.12.1"
platformdirs = "==4.3.6"
pycparser = "==2.22"
pydantic = "==2.10.5"
pydantic-core = "==2.27.2"
pydantic-settings = "==2.7.1"
pygments = "==2.19.1"
pyjwt = "==2.10.1"
pypika-tortoise = "==0.3.2"
python-dotenv = "==1.0.1"
python-multipart = "==0.0.20"
pytz = "==2024.2"
pyyaml = "==6.0.2"
rich = "==13.9.4"
rich-toolkit = "==0.13.2"
ruff = "==0.9.1"
shellingham = "==1.5.4"
sniffio = "==1.3.1"
starlette = "==0.37.2"
tortoise-orm = "==0.23.0"
typer = "==0.15.1"
typing-extensions = "==4.12.2"
ujson = "==5.10.0"
uvicorn = "==0.34.0"
uvloop = "==0.21.0"
watchfiles = "==1.0.4"
websockets = "==14.1"
asyncpg = "*"
pandas = "*"
openpyxl = "*"
pysocks = "*"
[dev-packages]
[requires]
python_version = "3.13"

1537
Pipfile.lock generated Normal file

File diff suppressed because it is too large Load Diff

22
README-en.md Normal file
View File

@ -0,0 +1,22 @@
docker run -d \
--name clickhouse-server \
--restart=unless-stopped \
--ulimit nofile=262144:262144 \
--ulimit memlock=-1:-1 \
--cap-add=SYS_NICE \
--cap-add=NET_ADMIN \
--cap-add=SYS_RESOURCE \
--security-opt seccomp=unconfined \
--shm-size=8g \
-p 8123:8123 \
-p 9000:9000 \
-p 9004:9004 \
-p 9005:9005 \
-p 9009:9009 \
-e CLICKHOUSE_DB=job_data \
-e CLICKHOUSE_USER=data_user \
-e CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT=1 \
-e CLICKHOUSE_PASSWORD=data_pass \
-v $PWD/ch_data:/var/lib/clickhouse/ \
-v $PWD/ch_logs:/var/log/clickhouse-server/ \
clickhouse/clickhouse-server:latest

24
README.md Normal file
View File

@ -0,0 +1,24 @@
export DOCKER_DEFAULT_PLATFORM=linux/amd64
docker build -t zfc931912343/admin-crawler:v2.1 .
docker push zfc931912343/admin-crawler:v2.1
docker build -t zfc931912343/boss-crawler:v1 .
docker push zfc931912343/boss-crawler:v1
sudo docker rm -f admin-crawler && sudo docker run -d --restart=always --name=admin-crawler --log-driver=json-file --log-opt max-size=10m --log-opt max-file=7 -p 9999:80 nbg2akd8w5diy8.xuanyuan.run/zfc931912343/admin-crawler:v1.5
docker run -d \
--name mysql-server \
--restart always \
-p 3306:3306 \
-v /opt/mysql/data:/var/lib/mysql \
-e MYSQL_ROOT_PASSWORD=jobdata123 \
-e MYSQL_DATABASE=job_data \
mysql:8.0 \
--character-set-server=utf8mb4 \
--collation-server=utf8mb4_unicode_ci

57
app/__init__.py Normal file
View File

@ -0,0 +1,57 @@
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
import os
from tortoise import Tortoise
from app.core.exceptions import SettingNotFound
from app.core.init_app import (
init_data,
make_middlewares,
register_exceptions,
register_routers,
)
from app.core.clickhouse import clickhouse_manager
from app.core.scheduler import start_scheduler, shutdown_scheduler
try:
from app.settings.config import settings
except ImportError:
raise SettingNotFound("Can not import settings")
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: initialise resources on startup, release them on shutdown.

    Startup initialises Tortoise ORM, generates schemas, seeds initial data and
    starts the scheduler. Shutdown runs in a ``finally`` block so it also
    happens when the application body raises.
    """
    await Tortoise.init(config=settings.TORTOISE_ORM)
    await Tortoise.generate_schemas()
    await init_data()
    start_scheduler()
    try:
        yield
    finally:
        # Stop the scheduler first so no job is started after the connections
        # below have been closed.
        shutdown_scheduler()
        # Close all database connections.
        await Tortoise.close_connections()
        await clickhouse_manager.close()
def create_app() -> FastAPI:
    """Build the FastAPI application.

    Registers exception handlers and the routers under ``/api``, and mounts
    ``<BASE_DIR>/static`` at ``/static`` when that directory exists.
    """
    options = dict(
        title=settings.APP_TITLE,
        description=settings.APP_DESCRIPTION,
        version=settings.VERSION,
        openapi_url="/openapi.json",
        middleware=make_middlewares(),
        lifespan=lifespan,
    )
    application = FastAPI(**options)

    register_exceptions(application)
    register_routers(application, prefix="/api")

    # Static files are optional: mount them only if the directory is present.
    static_root = os.path.join(settings.BASE_DIR, "static")
    if not os.path.exists(static_root):
        return application
    application.mount("/static", StaticFiles(directory=static_root), name="static")
    return application


app = create_app()

9
app/api/__init__.py Normal file
View File

@ -0,0 +1,9 @@
from fastapi import APIRouter
from .v1 import v1_router
# Top-level API router; the v1 routes are exposed under the "/v1" prefix.
api_router = APIRouter()
api_router.include_router(v1_router, prefix="/v1")
__all__ = ["api_router"]

39
app/api/v1/__init__.py Normal file
View File

@ -0,0 +1,39 @@
from fastapi import APIRouter
from app.core.dependency import DependPermission
from .apis import apis_router
from .auditlog import auditlog_router
from .base import base_router
from .depts import depts_router
from .menus import menus_router
from .roles import roles_router
from .users import users_router
from .token import token_router
from .proxy import proxy_router
from .job import job_router
from .stats import stats_router
from .pipeline import pipeline_router
from .keyword import keyword_router
from .cleaning import cleaning_router
from .analytics import router as analytics_router
v1_router = APIRouter()

# (router, URL prefix, extra include_router keyword arguments), in registration order.
# NOTE(review): job_router is mounted under both "/job" and "/universal" —
# confirm "/universal" is not meant to use a different router.
_V1_ROUTES = (
    (base_router, "/base", {}),
    (users_router, "/user", {"dependencies": [DependPermission]}),
    (roles_router, "/role", {"dependencies": [DependPermission]}),
    (menus_router, "/menu", {"dependencies": [DependPermission]}),
    (apis_router, "/api", {"dependencies": [DependPermission]}),
    (depts_router, "/dept", {"dependencies": [DependPermission]}),
    (auditlog_router, "/auditlog", {"dependencies": [DependPermission]}),
    (job_router, "/job", {"tags": ["数据入库"]}),
    (job_router, "/universal", {"tags": ["通用数据接口"]}),
    (token_router, "/token", {"tags": ["Token管理"]}),
    (proxy_router, "/proxy", {"tags": ["代理IP管理"]}),
    (stats_router, "/stats", {}),
    (pipeline_router, "/pipeline", {}),
    (keyword_router, "/keyword", {}),
    (cleaning_router, "/cleaning", {"dependencies": [DependPermission]}),
    (analytics_router, "/analytics", {"tags": ["数据分析"]}),
)
for _sub_router, _prefix, _extra in _V1_ROUTES:
    v1_router.include_router(_sub_router, prefix=_prefix, **_extra)

92
app/api/v1/analytics.py Normal file
View File

@ -0,0 +1,92 @@
from typing import Optional, List
from datetime import datetime, date, timezone
try:
from zoneinfo import ZoneInfo
except ImportError:
from backports.zoneinfo import ZoneInfo
from fastapi import APIRouter, Depends, Query
from app.core.clickhouse import clickhouse_manager
from app.services.analytics_service import AnalyticsService
from app.schemas.analytics import (
JobStatisticsResponse,
)
# Router for the analytics endpoints declared below.
router = APIRouter()
# Time zone applied to naive datetimes before they are converted to UTC.
CHINA_TZ = ZoneInfo("Asia/Shanghai")
async def get_analytics_service() -> AnalyticsService:
    """FastAPI dependency: build an AnalyticsService around the ClickHouse client."""
    return AnalyticsService(await clickhouse_manager.get_client())
def to_utc(dt: datetime) -> datetime:
    """Convert a datetime to UTC, treating naive values as Asia/Shanghai local time."""
    localized = dt if dt.tzinfo is not None else dt.replace(tzinfo=CHINA_TZ)
    return localized.astimezone(timezone.utc)
@router.get("/overview", response_model=JobStatisticsResponse, summary="获取职位统计总览")
async def get_overview(
    from_date: Optional[date] = None,
    to_date: Optional[date] = None,
    city: Optional[str] = None,
    service: AnalyticsService = Depends(get_analytics_service),
):
    """Return job statistics, optionally limited to a date window and a city.

    ``from_date`` is widened to the start of that day and ``to_date`` to the
    end of that day; both are converted to UTC before being passed on.
    """
    window_start = None
    if from_date:
        window_start = to_utc(datetime.combine(from_date, datetime.min.time()))

    window_end = None
    if to_date:
        window_end = to_utc(datetime.combine(to_date, datetime.max.time()))

    filters = {"city": city} if city else {}
    return await service.get_job_statistics(filters=filters, from_dt=window_start, to_dt=window_end)
@router.get("/trend/volume", summary="获取数据量趋势")
async def get_volume_trend(
    # `pattern` replaces the deprecated `regex` keyword of Query.
    interval: str = Query("day", pattern="^(day|hour|week|month)$"),
    from_date: Optional[date] = None,
    to_date: Optional[date] = None,
    from_datetime: Optional[datetime] = None,
    to_datetime: Optional[datetime] = None,
    service: AnalyticsService = Depends(get_analytics_service),
):
    """Return the data-volume trend for the requested interval.

    An exact ``from_datetime`` / ``to_datetime`` takes precedence (useful for
    hourly granularity); otherwise ``from_date`` / ``to_date`` are widened to
    the start / end of the day. Every bound is converted to UTC.
    """
    if from_datetime:
        from_dt = to_utc(from_datetime)
    elif from_date:
        from_dt = to_utc(datetime.combine(from_date, datetime.min.time()))
    else:
        from_dt = None

    if to_datetime:
        to_dt = to_utc(to_datetime)
    elif to_date:
        to_dt = to_utc(datetime.combine(to_date, datetime.max.time()))
    else:
        to_dt = None

    return await service.get_volume_trend(interval=interval, from_dt=from_dt, to_dt=to_dt)
@router.get("/distribution/source", summary="获取数据来源分布")
async def get_source_distribution(
    from_date: Optional[date] = None,
    to_date: Optional[date] = None,
    from_datetime: Optional[datetime] = None,
    to_datetime: Optional[datetime] = None,
    service: AnalyticsService = Depends(get_analytics_service),
):
    """Return the distribution of records per data source for a time window.

    Exact datetimes win over dates; dates are widened to the start / end of
    the day. Every bound is converted to UTC.
    """
    window_start = None
    if from_datetime:
        window_start = to_utc(from_datetime)
    elif from_date:
        window_start = to_utc(datetime.combine(from_date, datetime.min.time()))

    window_end = None
    if to_datetime:
        window_end = to_utc(to_datetime)
    elif to_date:
        window_end = to_utc(datetime.combine(to_date, datetime.max.time()))

    return await service.get_source_distribution(from_dt=window_start, to_dt=window_end)

View File

@ -0,0 +1,8 @@
from fastapi import APIRouter
from .apis import router
# Package-level router that re-exports the API-management endpoints under one tag.
apis_router = APIRouter()
apis_router.include_router(router, tags=["API模块"])
__all__ = ["apis_router"]

67
app/api/v1/apis/apis.py Normal file
View File

@ -0,0 +1,67 @@
from fastapi import APIRouter, Query
from tortoise.expressions import Q
from app.controllers.api import api_controller
from app.schemas import Success, SuccessExtra
from app.schemas.apis import *
# Router for the API-management endpoints declared below.
router = APIRouter()
@router.get("/list", summary="查看API列表")
async def list_api(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
    path: str = Query(None, description="API路径"),
    summary: str = Query(None, description="API简介"),
    tags: str = Query(None, description="API模块"),
):
    """Return one page of API records, filtered by substring on path, summary and tags."""
    criteria = Q()
    # Only non-empty filters contribute a condition.
    for lookup, value in (
        ("path__contains", path),
        ("summary__contains", summary),
        ("tags__contains", tags),
    ):
        if value:
            criteria &= Q(**{lookup: value})

    total, records = await api_controller.list(
        page=page, page_size=page_size, search=criteria, order=["tags", "id"]
    )
    rows = []
    for record in records:
        rows.append(await record.to_dict())
    return SuccessExtra(data=rows, total=total, page=page, page_size=page_size)
@router.get("/get", summary="查看Api")
async def get_api(
    id: int = Query(..., description="Api"),
):
    """Return a single API record as a dict."""
    record = await api_controller.get(id=id)
    return Success(data=await record.to_dict())
@router.post("/create", summary="创建Api")
async def create_api(
    api_in: ApiCreate,
):
    """Create an API record from the request body."""
    await api_controller.create(obj_in=api_in)
    response = Success(msg="Created Successfully")
    return response
@router.post("/update", summary="更新Api")
async def update_api(
    api_in: ApiUpdate,
):
    """Update the API record identified by ``api_in.id``."""
    target_id = api_in.id
    await api_controller.update(id=target_id, obj_in=api_in)
    return Success(msg="Update Successfully")
@router.delete("/delete", summary="删除Api")
async def delete_api(
    api_id: int = Query(..., description="ApiID"),
):
    """Delete the API record with the given id."""
    await api_controller.remove(id=api_id)
    response = Success(msg="Deleted Success")
    return response
@router.post("/refresh", summary="刷新API列表")
async def refresh_api():
    """Ask the API controller to refresh its API list."""
    await api_controller.refresh_api()
    response = Success(msg="OK")
    return response

View File

@ -0,0 +1,8 @@
from fastapi import APIRouter
from .auditlog import router
# Package-level router that re-exports the audit-log endpoints under one tag.
auditlog_router = APIRouter()
auditlog_router.include_router(router, tags=["审计日志模块"])
__all__ = ["auditlog_router"]

View File

@ -0,0 +1,48 @@
from datetime import datetime
from fastapi import APIRouter, Query
from tortoise.expressions import Q
from app.models.admin import AuditLog
from app.schemas import SuccessExtra
from app.schemas.apis import *
# Router for the audit-log endpoint declared below.
router = APIRouter()
@router.get("/list", summary="查看操作日志")
async def get_audit_log_list(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
    username: str = Query("", description="操作人名称"),
    module: str = Query("", description="功能模块"),
    method: str = Query("", description="请求方法"),
    summary: str = Query("", description="接口描述"),
    path: str = Query("", description="请求路径"),
    status: int = Query(None, description="状态码"),
    start_time: datetime = Query("", description="开始时间"),
    end_time: datetime = Query("", description="结束时间"),
):
    """Return one page of audit-log entries, newest first.

    Text filters are case-insensitive substring matches, ``status`` is an exact
    match, and ``start_time`` / ``end_time`` bound ``created_at`` (either one
    may be given alone).
    """
    criteria = Q()
    text_filters = {
        "username__icontains": username,
        "module__icontains": module,
        "method__icontains": method,
        "summary__icontains": summary,
        "path__icontains": path,
    }
    for lookup, value in text_filters.items():
        if value:
            criteria &= Q(**{lookup: value})
    if status:
        criteria &= Q(status=status)

    # Time window: closed range when both bounds are given, otherwise one-sided.
    if start_time and end_time:
        criteria &= Q(created_at__range=[start_time, end_time])
    elif start_time:
        criteria &= Q(created_at__gte=start_time)
    elif end_time:
        criteria &= Q(created_at__lte=end_time)

    skip = (page - 1) * page_size
    entries = await AuditLog.filter(criteria).offset(skip).limit(page_size).order_by("-created_at")
    total = await AuditLog.filter(criteria).count()
    rows = []
    for entry in entries:
        rows.append(await entry.to_dict())
    return SuccessExtra(data=rows, total=total, page=page, page_size=page_size)

View File

@ -0,0 +1,8 @@
from fastapi import APIRouter
from .base import router
# Package-level router that re-exports the base endpoints under one tag.
base_router = APIRouter()
base_router.include_router(router, tags=["基础模块"])
__all__ = ["base_router"]

103
app/api/v1/base/base.py Normal file
View File

@ -0,0 +1,103 @@
from datetime import datetime, timedelta, timezone
from fastapi import APIRouter
from app.controllers.user import user_controller
from app.core.ctx import CTX_USER_ID
from app.core.dependency import DependAuth
from app.models.admin import Api, Menu, Role, User
from app.schemas.base import Fail, Success
from app.schemas.login import *
from app.schemas.users import UpdatePassword
from app.settings import settings
from app.utils.jwt_utils import create_access_token
from app.utils.password import get_password_hash, verify_password
# Router for the login / current-user endpoints declared below.
router = APIRouter()
@router.post("/access_token", summary="获取token")
async def login_access_token(credentials: CredentialsSchema):
    """Authenticate the credentials and return a JWT access token with the username.

    The token expires ``settings.JWT_ACCESS_TOKEN_EXPIRE_MINUTES`` minutes from
    now (UTC). The user's last-login record is updated as a side effect.
    """
    user: User = await user_controller.authenticate(credentials)
    await user_controller.update_last_login(user.id)

    lifetime = timedelta(minutes=settings.JWT_ACCESS_TOKEN_EXPIRE_MINUTES)
    expires_at = datetime.now(timezone.utc) + lifetime
    payload = JWTPayload(
        user_id=user.id,
        username=user.username,
        is_superuser=user.is_superuser,
        exp=expires_at,
    )
    token = create_access_token(data=payload)
    result = JWTOut(access_token=token, username=user.username)
    return Success(data=result.model_dump())
@router.get("/userinfo", summary="查看用户信息", dependencies=[DependAuth])
async def get_userinfo():
    """Return the current user's profile without the password, plus a fixed avatar URL."""
    current_user = await user_controller.get(id=CTX_USER_ID.get())
    profile = await current_user.to_dict(exclude_fields=["password"])
    profile.update(avatar="https://avatars.githubusercontent.com/u/54677442?v=4")
    return Success(data=profile)
@router.get("/usermenu", summary="查看用户菜单", dependencies=[DependAuth])
async def get_user_menu():
    """Return the current user's menus as a two-level tree.

    Superusers get every menu; other users get the de-duplicated union of the
    menus attached to their roles. Menus with ``parent_id == 0`` are roots and
    each root carries its direct children under ``"children"``.
    """
    current_user = await User.filter(id=CTX_USER_ID.get()).first()

    if current_user.is_superuser:
        visible: list[Menu] = await Menu.all()
    else:
        # NOTE(review): nesting was reconstructed from a flattened view; the
        # de-duplication is assumed to apply to the role-derived list only.
        gathered: list[Menu] = []
        for role in await current_user.roles:
            gathered.extend(await role.menus)
        visible = list(set(gathered))

    roots: list[Menu] = [item for item in visible if item.parent_id == 0]

    tree = []
    for root in roots:
        node = await root.to_dict()
        node["children"] = []
        for candidate in visible:
            if candidate.parent_id != root.id:
                continue
            node["children"].append(await candidate.to_dict())
        tree.append(node)
    return Success(data=tree)
@router.get("/userapi", summary="查看用户API", dependencies=[DependAuth])
async def get_user_api():
    """Return the API identifiers (lower-cased method + path) the current user may call.

    Superusers get every API; other users get the de-duplicated union of the
    APIs attached to their roles.
    """
    current_user = await User.filter(id=CTX_USER_ID.get()).first()

    if current_user.is_superuser:
        every_api: list[Api] = await Api.all()
        return Success(data=[entry.method.lower() + entry.path for entry in every_api])

    granted = []
    roles: list[Role] = await current_user.roles
    for role in roles:
        role_apis: list[Api] = await role.apis
        for entry in role_apis:
            granted.append(entry.method.lower() + entry.path)
    return Success(data=list(set(granted)))
@router.post("/update_password", summary="修改密码", dependencies=[DependAuth])
async def update_user_password(req_in: UpdatePassword):
    """Change the current user's password after verifying the old one."""
    account = await user_controller.get(CTX_USER_ID.get())

    # Reject the change unless the supplied old password matches the stored hash.
    if not verify_password(req_in.old_password, account.password):
        return Fail(msg="旧密码验证错误!")

    account.password = get_password_hash(req_in.new_password)
    await account.save()
    return Success(msg="修改成功")

View File

@ -0,0 +1,5 @@
from fastapi import APIRouter
from .cleaning import router
# Package-level router that re-exports the data-cleaning endpoints under one tag.
cleaning_router = APIRouter()
cleaning_router.include_router(router, tags=["数据清洗"])

View File

@ -0,0 +1,321 @@
from fastapi import APIRouter, File, UploadFile, Form, Body, Query
from app.services.cleaning import CleaningService
from app.services.company_cleaner import company_cleaner
from app.controllers.cleaning import cleaning_controller
from app.schemas import Success, SuccessExtra
from app.models.cleaning import CleaningTask
from app.core.clickhouse import clickhouse_manager
from tortoise.expressions import Q
from typing import Optional
import json
# Router for the data-cleaning endpoints declared below.
router = APIRouter()
# Module-level service instance shared by the upload and task-processing endpoints.
cleaning_service = CleaningService()
@router.get("/stats", summary="获取公司清洗统计信息")
async def get_stats():
    """Return statistics about the pending-company table in ClickHouse.

    The payload holds the number of pending rows, the number of rows marked
    done today, and a per-source breakdown by status.
    """
    client = await clickhouse_manager.get_client()

    async def _scalar(sql):
        # First column of the first row, or 0 when the query returns nothing.
        result = await client.query(sql)
        return result.result_rows[0][0] if result.result_rows else 0

    pending_count = await _scalar(
        "SELECT count() FROM job_data.pending_company FINAL WHERE status = 'pending'"
    )
    today_count = await _scalar(
        "SELECT count() FROM job_data.pending_company FINAL WHERE status = 'done' AND toDate(updated_at) = today()"
    )

    dist_sql = """
SELECT source, status, count()
FROM job_data.pending_company FINAL
GROUP BY source, status
ORDER BY source, status
"""
    dist_res = await client.query(dist_sql)

    # Fold the (source, status, count) rows into one counter dict per source.
    per_source = {}
    for source, status, count in dist_res.result_rows:
        bucket = per_source.setdefault(source, {"pending": 0, "done": 0, "failed": 0, "total": 0})
        if status in bucket:
            bucket[status] = count
        # NOTE(review): nesting was reconstructed from a flattened view; the
        # total is assumed to include statuses outside pending/done/failed.
        bucket["total"] += count

    stats = {
        "total_pending": pending_count,
        "today_processed": today_count,
        "details": [{"source": name, **counters} for name, counters in per_source.items()],
    }
    return Success(data=stats)
@router.get("/companies", summary="获取公司清洗列表")
async def get_companies_list(
    page: int = Query(1, ge=1),
    page_size: int = Query(20, ge=1, le=100),
    source: Optional[str] = Query(None),
    status: Optional[str] = Query(None),
):
    """Return one page of pending-company rows, newest update first.

    ``source`` and ``status`` are optional exact-match filters. They come from
    the query string, so they are passed as bound parameters and never
    interpolated into the SQL text.
    """
    client = await clickhouse_manager.get_client()
    offset = (page - 1) * page_size

    where_clauses = []
    params = {}
    if source:
        where_clauses.append("source = {source:String}")
        params["source"] = source
    if status:
        where_clauses.append("status = {status:String}")
        params["status"] = status
    where_sql = " WHERE " + " AND ".join(where_clauses) if where_clauses else ""

    # Total number of matching rows.
    count_sql = f"SELECT count() FROM job_data.pending_company FINAL {where_sql}"
    count_res = await client.query(count_sql, parameters=params)
    total = count_res.result_rows[0][0] if count_res.result_rows else 0

    # Page of data. page_size and offset are integers derived from validated input.
    sql = f"""
SELECT source, company_id, company_name, status, error_msg, created_at, updated_at
FROM job_data.pending_company FINAL
{where_sql}
ORDER BY updated_at DESC
LIMIT {page_size} OFFSET {offset}
"""
    res = await client.query(sql, parameters=params)

    data = [
        {
            "source": row[0],
            "company_id": row[1],
            "company_name": row[2],
            "status": row[3],
            "error_msg": row[4],
            "created_at": row[5].isoformat() if row[5] else None,
            "updated_at": row[6].isoformat() if row[6] else None,
        }
        for row in res.result_rows
    ]
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
@router.get("/company-detail", summary="获取公司清洗详情")
async def get_company_cleaning_detail(
    source: str = Query(..., description="数据源"),
    company_id: str = Query(..., description="公司ID"),
    company_name: Optional[str] = Query(None, description="公司名称"),
):
    """Return the most recently updated cleaned record for one company.

    For ``qcwy`` the row is matched on any of the id fields inside
    ``json_data`` and, only when a name is supplied, on ``company_name``.
    For the other sources the company name is required and is the only key.
    Unsupported sources, a missing name and "no row found" are reported
    through the ``code`` field of the response.
    """
    client = await clickhouse_manager.get_client()

    # The table name comes from this fixed mapping, never from raw user input.
    table_map = {
        "boss": "boss_company",
        "qcwy": "qcwy_company",
        "zhilian": "zhilian_company",
    }
    table = table_map.get(source)
    if not table:
        return Success(code=400, msg="不支持的数据源")

    if source == "qcwy":
        conditions = [
            "JSONExtractString(json_data, 'companyId') = {company_id:String}",
            "JSONExtractString(json_data, 'coId') = {company_id:String}",
            "JSONExtractString(json_data, 'coinfo', 'coid') = {company_id:String}",
        ]
        params = {"company_id": str(company_id)}
        # Only match on the name when one was given: comparing against an
        # empty string could return an unrelated row whose name is empty.
        if company_name:
            conditions.append("company_name = {company_name:String}")
            params["company_name"] = str(company_name)
        where_sql = "\nOR ".join(conditions)
    else:
        if not company_name:
            return Success(code=400, msg="缺少公司名称")
        where_sql = "company_name = {company_name:String}"
        params = {"company_name": str(company_name)}

    sql = f"""
SELECT json_data, company_name, created_at, updated_at
FROM job_data.{table}
WHERE {where_sql}
ORDER BY updated_at DESC
LIMIT 1
"""
    res = await client.query(sql, parameters=params)
    if not res.result_rows:
        return Success(code=404, msg="未找到公司清洗结果")

    row = res.result_rows[0]
    raw_json = row[0]
    try:
        data = json.loads(raw_json)
    except (TypeError, ValueError):
        # Not valid JSON: hand the raw payload back instead of failing.
        data = {"raw": raw_json}

    return Success(
        data={
            "source": source,
            "company_id": company_id,
            "company_name": row[1],
            "created_at": row[2].isoformat() if row[2] else None,
            "updated_at": row[3].isoformat() if row[3] else None,
            "data": data,
        }
    )
@router.post("/collect-pending", summary="分析待处理数据")
async def collect_pending_companies_api(
    limit: int = Body(1000, embed=True, ge=1, le=10000),
    source: Optional[str] = Body(None, embed=True),
):
    """Analyse the job data and collect pending company ids into pending_company."""
    await company_cleaner.collect_pending_companies(limit=limit, source=source)
    message = f"已完成数据分析,已收集待处理公司(上限 {limit} 条)"
    return Success(msg=message)
@router.post("/run-pending", summary="手动执行待处理公司清洗")
async def run_pending_companies(
    limit: int = Body(100, embed=True, ge=1, le=5000),
    source: Optional[str] = Body(None, embed=True),
    proxy: Optional[str] = Body(None, embed=True),
    max_delay_seconds: int = Body(5, embed=True),
):
    """Manually trigger the pending-company cleaning job.

    Only records whose current status is pending are processed; records that
    were already processed are not executed again.
    """
    options = dict(
        limit=limit,
        source=source,
        proxy=proxy,
        max_delay_seconds=max_delay_seconds,
    )
    await company_cleaner.process_pending_companies(**options)
    return Success(msg=f"已触发执行最近 {limit} 条待处理公司清洗任务")
@router.post("/crawl-execute", summary="爬取并执行待处理公司清洗")
async def crawl_execute_pending(
    limit: int = Body(100, embed=True, ge=1, le=5000),
    source: Optional[str] = Body(None, embed=True),
    proxy: Optional[str] = Body(None, embed=True),
    max_delay_seconds: int = Body(5, embed=True),
):
    """Collect pending companies, then process up to ``limit`` of them.

    The collection step is called with ``source`` only, so ``limit`` applies
    to the processing step alone.
    """
    await company_cleaner.collect_pending_companies(source=source)
    options = dict(
        limit=limit,
        source=source,
        proxy=proxy,
        max_delay_seconds=max_delay_seconds,
    )
    await company_cleaner.process_pending_companies(**options)
    return Success(msg=f"已触发爬取并执行最近 {limit} 条待处理公司清洗任务")
@router.post("/process-company", summary="执行单个公司清洗任务")
async def process_single_company_api(
    source: str = Body(..., embed=True),
    company_id: str = Body(..., embed=True),
    proxy: Optional[str] = Body(None, embed=True),
    max_delay_seconds: int = Body(5, embed=True),
):
    """Run the cleaning job for one company and return the cleaner's result dict."""
    outcome = await company_cleaner.process_single_company(
        source=source,
        company_id=company_id,
        proxy=proxy,
        max_delay_seconds=max_delay_seconds,
    )
    # The message reflects the truthiness of the result's "success" entry.
    if bool(outcome.get("success")):
        message = "任务执行成功"
    else:
        message = "任务执行失败"
    return Success(msg=message, data=outcome)
@router.post("/upload", summary="上传文件并保存任务")
async def upload_file(
    file: UploadFile = File(...),
    clean_type: str = Form("auto"),
    platform: str = Form("auto"),
    proxy: Optional[str] = Form(None),
):
    """Parse the uploaded file into targets and store one pending task per target."""
    parsed_targets = await cleaning_service.parse_file(file)

    new_tasks = []
    for item in parsed_targets:
        new_tasks.append(
            CleaningTask(
                target=item,
                clean_type=clean_type,
                platform=platform,
                proxy=proxy,
                status="pending",
            )
        )

    # Skip the bulk insert when the file produced no targets.
    if new_tasks:
        await CleaningTask.bulk_create(new_tasks)
    return Success(msg=f"Successfully imported {len(new_tasks)} tasks")
@router.get("/list", summary="获取清洗任务列表")
async def list_tasks(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
    target: str = Query(None, description="目标搜索"),
    status: str = Query(None, description="状态筛选"),
    clean_type: str = Query(None, description="清洗类型筛选"),
):
    """Return one page of cleaning tasks, newest first.

    ``target`` is a substring filter; ``status`` and ``clean_type`` are exact
    matches.
    """
    criteria = Q()
    for lookup, value in (
        ("target__contains", target),
        ("status", status),
        ("clean_type", clean_type),
    ):
        if value:
            criteria &= Q(**{lookup: value})

    total, found = await cleaning_controller.list(
        page=page, page_size=page_size, search=criteria, order=["-created_at"]
    )
    rows = []
    for task in found:
        rows.append(await task.to_dict())
    return SuccessExtra(data=rows, total=total, page=page, page_size=page_size)
@router.post("/process/{task_id}", summary="处理单个任务")
async def process_task(task_id: int):
    """Process one stored cleaning task and persist the outcome on the task row.

    The task is saved as "processing" before the work starts, then updated
    with the final status and the result fields returned by the service.
    """
    record = await cleaning_controller.get(id=task_id)
    if not record:
        return Success(code=404, msg="Task not found")

    record.status = "processing"
    await record.save()

    outcome = await cleaning_service.process_single_item(
        target=record.target,
        clean_type=record.clean_type,
        platform=record.platform,
        proxy=record.proxy,
    )

    if outcome.get("success"):
        record.status = "success"
    else:
        record.status = "fail"
    record.storage_status = outcome.get("storage_status", "unknown")
    record.remote_sent = outcome.get("remote_sent", False)
    record.result_summary = outcome.get("data_summary")
    record.error_msg = outcome.get("error")
    await record.save()

    return Success(data=await record.to_dict(), msg="Task processed")
@router.delete("/delete", summary="删除任务")
async def delete_task(
    id: int = Query(..., description="任务ID"),
):
    """Delete the cleaning task with the given id."""
    await cleaning_controller.remove(id=id)
    response = Success(msg="Deleted Successfully")
    return response
@router.post("/clear", summary="清空所有任务")
async def clear_tasks():
    """Delete every cleaning task."""
    every_task = CleaningTask.all()
    await every_task.delete()
    return Success(msg="All tasks cleared")

View File

@ -0,0 +1,8 @@
from fastapi import APIRouter
from .depts import router
# Package-level router that re-exports the department endpoints under one tag.
depts_router = APIRouter()
depts_router.include_router(router, tags=["部门模块"])
__all__ = ["depts_router"]

48
app/api/v1/depts/depts.py Normal file
View File

@ -0,0 +1,48 @@
from fastapi import APIRouter, Query
from app.controllers.dept import dept_controller
from app.schemas import Success
from app.schemas.depts import *
# Router for the department endpoints declared below.
router = APIRouter()
@router.get("/list", summary="查看部门列表")
async def list_dept(
    name: str = Query(None, description="部门名称"),
):
    """Return the department tree produced by the controller for ``name``."""
    tree = await dept_controller.get_dept_tree(name)
    response = Success(data=tree)
    return response
@router.get("/get", summary="查看部门")
async def get_dept(
    id: int = Query(..., description="部门ID"),
):
    """Return a single department as a dict."""
    department = await dept_controller.get(id=id)
    return Success(data=await department.to_dict())
@router.post("/create", summary="创建部门")
async def create_dept(
    dept_in: DeptCreate,
):
    """Create a department from the request body."""
    await dept_controller.create_dept(obj_in=dept_in)
    response = Success(msg="Created Successfully")
    return response
@router.post("/update", summary="更新部门")
async def update_dept(
    dept_in: DeptUpdate,
):
    """Update a department from the request body."""
    await dept_controller.update_dept(obj_in=dept_in)
    response = Success(msg="Update Successfully")
    return response
@router.delete("/delete", summary="删除部门")
async def delete_dept(dept_id: int = Query(..., description="部门ID")):
    """Delete the department identified by ``dept_id``."""
    target_id = dept_id
    await dept_controller.delete_dept(dept_id=target_id)
    return Success(msg="Deleted Success")

View File

@ -0,0 +1,46 @@
from typing import Optional, List, Dict, Any
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel, Field
from clickhouse_connect.driver import AsyncClient
from app.core.clickhouse import clickhouse_manager
from app.services.ingest_service import IngestService
router = APIRouter()
class IngestSingleRequest(BaseModel):
    """Request body for ingesting one record."""

    # Required fields (no default).
    platform: str
    data_type: str
    data: Dict[str, Any]
    # Duplicate checking is on unless the caller disables it.
    check_duplicate: bool = True
class IngestBatchRequest(BaseModel):
    """Request body for ingesting a list of records in one call."""

    # Required fields (no default).
    platform: str
    data_type: str
    data_list: List[Dict[str, Any]]
    # Duplicate checking is on unless the caller disables it.
    check_duplicate: bool = True
async def get_service() -> IngestService:
    """FastAPI dependency: wrap the ClickHouse client from ``clickhouse_manager`` in an ``IngestService``."""
    ch_client: AsyncClient = await clickhouse_manager.get_client()
    service = IngestService(ch_client)
    return service
@router.post("/data")
async def ingest_data(req: IngestSingleRequest, service: IngestService = Depends(get_service)):
    """Store a single record through ``IngestService.store_single``.

    Returns:
        ``{"code": 200, "data": <service result>, "message": "ok"}``.

    Raises:
        HTTPException: 500 with the exception text when the service call fails.
    """
    try:
        stored = await service.store_single(
            req.platform,
            req.data_type,
            req.data,
            req.check_duplicate,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return {"code": 200, "data": stored, "message": "ok"}
@router.post("/batch")
async def ingest_batch(req: IngestBatchRequest, service: IngestService = Depends(get_service)):
    """Store a list of records through ``IngestService.store_batch``.

    Returns:
        ``{"code": 200, "data": <service result>, "message": "ok"}``.

    Raises:
        HTTPException: 500 with the exception text when the service call fails.
    """
    try:
        stored = await service.store_batch(
            req.platform,
            req.data_type,
            req.data_list,
            req.check_duplicate,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return {"code": 200, "data": stored, "message": "ok"}

View File

@ -0,0 +1,8 @@
from fastapi import APIRouter
from .job import router
# Package-level router: mounts the data-reporting endpoints under a single tag.
job_router = APIRouter()
job_router.include_router(router, tags=["数据上报"])
# Only the aggregated router is exported from this package.
__all__ = ["job_router"]

252
app/api/v1/job/job.py Normal file
View File

@ -0,0 +1,252 @@
from fastapi import APIRouter, BackgroundTasks, Depends
from typing import Dict, Any
from app.services.job import create_data_router_service, PlatformType, DataType
from app.core.clickhouse import clickhouse_manager
from app.controllers.job import (
UniversalDataController,
UniversalDataRequest,
BatchDataRequest,
create_universal_data_controller
)
router = APIRouter(tags=["通用数据接口"])
async def get_universal_data_controller() -> UniversalDataController:
    """FastAPI dependency that assembles a ``UniversalDataController``.

    Obtains the ClickHouse client, builds the data-router service around it,
    and hands that service to the controller factory.
    """
    ch_client = await clickhouse_manager.get_client()
    routing_service = create_data_router_service(ch_client)
    controller = create_universal_data_controller(routing_service)
    return controller
@router.post("/data/store", summary="存储单条数据")
async def store_single_data(
    request: UniversalDataRequest,
    controller: UniversalDataController = Depends(get_universal_data_controller),
) -> Dict[str, Any]:
    """
    Generic storage endpoint - store a single record.

    Supported platforms:
    - boss: Boss Zhipin
    - qcwy: 51job
    - zhilian: Zhilian Zhaopin

    Supported data types:
    - job: job posting data
    - company: company data

    Example request:
    ```json
    {
        "data": {
            "jobBaseInfoVO": {
                "encryptJobId": "abc123",
                "positionName": "Python Developer",
                "locationName": "Beijing"
            },
            "brandComInfoVO": {
                "brandName": "Some Tech Company",
                "industryName": "Internet"
            }
        },
        "data_type": "job",
        "platform": "boss",
        "check_duplicate": true,
        "duplicate_key": "encrypt_job_id"
    }
    ```
    """
    outcome = await controller.store_single_data(request)
    return outcome
@router.post("/data/batch-store", summary="批量存储数据")
async def store_batch_data(
    request: BatchDataRequest,
    controller: UniversalDataController = Depends(get_universal_data_controller),
) -> Dict[str, Any]:
    """
    Generic storage endpoint - store a batch of records.

    Example request:
    ```json
    {
        "data_list": [
            {
                "jobBaseInfoVO": {
                    "encryptJobId": "abc123",
                    "positionName": "Python Developer"
                }
            },
            {
                "jobBaseInfoVO": {
                    "encryptJobId": "def456",
                    "positionName": "Java Developer"
                }
            }
        ],
        "data_type": "job",
        "platform": "boss",
        "check_duplicate": true
    }
    ```
    """
    outcome = await controller.store_batch_data(request)
    return outcome
@router.post("/data/store-async", summary="异步存储单条数据")
async def store_single_data_async(
    request: UniversalDataRequest,
    background_tasks: BackgroundTasks,
    controller: UniversalDataController = Depends(get_universal_data_controller),
) -> Dict[str, Any]:
    """
    Generic storage endpoint - store a single record asynchronously.

    Intended for high volumes or callers that do not need the result immediately.
    The request and the FastAPI ``BackgroundTasks`` object are handed to the controller.
    """
    outcome = await controller.store_single_data_async(background_tasks, request)
    return outcome
@router.post("/data/batch-store-async", summary="异步批量存储数据")
async def store_batch_data_async(
    request: BatchDataRequest,
    background_tasks: BackgroundTasks,
    controller: UniversalDataController = Depends(get_universal_data_controller),
) -> Dict[str, Any]:
    """
    Generic storage endpoint - store a batch of records asynchronously.

    Intended for large batch processing. The request and the FastAPI
    ``BackgroundTasks`` object are handed to the controller.
    """
    outcome = await controller.store_batch_data_async(background_tasks, request)
    return outcome
@router.get("/data", summary="查询数据")
async def query_data(
    platform: str,
    data_type: str,
    page: int = 1,
    page_size: int = 20,
    controller: UniversalDataController = Depends(get_universal_data_controller),
) -> Dict[str, Any]:
    """
    Generic data query endpoint.

    Args:
        platform: Platform identifier (boss/qcwy/zhilian).
        data_type: Data type (job/company).
        page: Page number, default 1.
        page_size: Page size, default 20.

    Raises:
        HTTPException: 400 when ``platform`` or ``data_type`` is not a known value.
    """
    # Local import: this module's top-level fastapi import does not include HTTPException.
    from fastapi import HTTPException

    # Convert the string parameters to enum members. An unknown value would
    # otherwise escape as an unhandled ValueError and surface as a 500.
    try:
        platform_enum = PlatformType(platform)
        data_type_enum = DataType(data_type)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    return await controller.query_data(platform_enum, data_type_enum, page, page_size)
@router.get("/platforms", summary="获取支持的平台和数据类型")
async def get_supported_platforms(
    controller: UniversalDataController = Depends(get_universal_data_controller),
) -> Dict[str, Any]:
    """
    Return information about the supported platforms and data types.

    Per the original description the response contains:
    - the list of supported platforms
    - the list of supported data types
    - the default duplicate-check field for each platform
    """
    info = await controller.get_supported_platforms()
    return info
# Platform-specific route aliases, kept for compatibility.
@router.post("/boss/job", summary="Boss直聘职位数据存储")
async def store_boss_job_data(
    data: Dict[str, Any],
    controller: UniversalDataController = Depends(get_universal_data_controller),
) -> Dict[str, Any]:
    """Convenience endpoint: store one Boss Zhipin job record with duplicate checking on."""
    wrapped = UniversalDataRequest(
        platform="boss",
        data_type="job",
        data=data,
        check_duplicate=True,
    )
    return await controller.store_single_data(wrapped)
@router.post("/boss/company", summary="Boss直聘公司数据存储")
async def store_boss_company_data(
    data: Dict[str, Any],
    controller: UniversalDataController = Depends(get_universal_data_controller),
) -> Dict[str, Any]:
    """Convenience endpoint: store one Boss Zhipin company record with duplicate checking on."""
    wrapped = UniversalDataRequest(
        platform="boss",
        data_type="company",
        data=data,
        check_duplicate=True,
    )
    return await controller.store_single_data(wrapped)
@router.post("/qcwy/job", summary="前程无忧职位数据存储")
async def store_qcwy_job_data(
    data: Dict[str, Any],
    controller: UniversalDataController = Depends(get_universal_data_controller),
) -> Dict[str, Any]:
    """Convenience endpoint: store one 51job (qcwy) job record with duplicate checking on."""
    wrapped = UniversalDataRequest(
        platform="qcwy",
        data_type="job",
        data=data,
        check_duplicate=True,
    )
    return await controller.store_single_data(wrapped)
@router.post("/qcwy/company", summary="前程无忧公司数据存储")
async def store_qcwy_company_data(
    data: Dict[str, Any],
    controller: UniversalDataController = Depends(get_universal_data_controller),
) -> Dict[str, Any]:
    """Convenience endpoint: store one 51job (qcwy) company record with duplicate checking on."""
    wrapped = UniversalDataRequest(
        platform="qcwy",
        data_type="company",
        data=data,
        check_duplicate=True,
    )
    return await controller.store_single_data(wrapped)
@router.post("/zhilian/job", summary="智联招聘职位数据存储")
async def store_zhilian_job_data(
    data: Dict[str, Any],
    controller: UniversalDataController = Depends(get_universal_data_controller),
) -> Dict[str, Any]:
    """Convenience endpoint: store one Zhilian job record with duplicate checking on."""
    wrapped = UniversalDataRequest(
        platform="zhilian",
        data_type="job",
        data=data,
        check_duplicate=True,
    )
    return await controller.store_single_data(wrapped)
@router.post("/zhilian/company", summary="智联招聘公司数据存储")
async def store_zhilian_company_data(
    data: Dict[str, Any],
    controller: UniversalDataController = Depends(get_universal_data_controller),
) -> Dict[str, Any]:
    """Convenience endpoint: store one Zhilian company record with duplicate checking on."""
    wrapped = UniversalDataRequest(
        platform="zhilian",
        data_type="company",
        data=data,
        check_duplicate=True,
    )
    return await controller.store_single_data(wrapped)

View File

@ -0,0 +1,9 @@
from fastapi import APIRouter
from .keyword import router
# Package-level router: mounts the keyword endpoints under a single tag.
keyword_router = APIRouter()
keyword_router.include_router(router, tags=["关键词接口"])
# Only the aggregated router is exported from this package.
__all__ = ["keyword_router"]

View File

@ -0,0 +1,182 @@
from typing import Any, Dict, List
from fastapi import APIRouter, Depends, Query
from pydantic import BaseModel, Field
from app.controllers.keyword import KeywordController
from app.core.dependency import DependPermission
from app.schemas.keyword import KeywordCreate, KeywordUpdate
router = APIRouter(tags=["关键词接口"])
class MarkUsedRequest(BaseModel):
    """Request body for marking keyword records as used."""

    # Platform identifier; must be exactly one of boss / qcwy / zhilian.
    source: str = Field(pattern="^(boss|qcwy|zhilian)$")
    # IDs of the records to mark.
    ids: List[int]
class StatsQuery(BaseModel):
    """Parameters for a statistics query (platform plus optional date string)."""

    # Platform identifier; must be exactly one of boss / qcwy / zhilian.
    source: str = Field(pattern="^(boss|qcwy|zhilian)$")
    # Optional date string; no format is enforced by this model.
    date: str | None = None
async def get_keyword_controller() -> KeywordController:
    """FastAPI dependency that provides a keyword controller.

    Returns:
        A newly constructed ``KeywordController``.
    """
    controller = KeywordController()
    return controller
@router.get("/available", summary="获取当天未使用的检索条件")
async def get_available(
    source: str,
    limit: int = 1,
    reserve: bool = True,
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Fetch search conditions for a platform that have not been used today.

    Args:
        source: Platform identifier (boss|qcwy|zhilian).
        limit: Maximum number of items to return, default 1.
        reserve: Passed through to the controller unchanged.
            NOTE(review): presumably reserves the returned items - confirm in the controller.

    Returns:
        The controller's result; per the original description a dictionary
        containing items/total/limit.
    """
    available = await controller.get_available(source, limit, reserve)
    return available
@router.post("/mark-used", summary="将检索条件标记为今日已使用")
async def mark_used(
    request: MarkUsedRequest,
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Mark the given search conditions as used today.

    Args:
        request: Body carrying the platform identifier and the record IDs.

    Returns:
        The controller's result; per the original description it includes the
        number of updated rows and the date.
    """
    platform, record_ids = request.source, request.ids
    return await controller.mark_used(platform, record_ids)
@router.get("/stats", summary="统计使用与未使用数量")
async def get_stats(
    source: str,
    date: str | None = None,
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Count used and unused search conditions for a platform on a given date.

    Args:
        source: Platform identifier (boss|qcwy|zhilian).
        date: Date as ``YYYY-MM-DD``. When omitted, ``None`` is passed to the
            controller (documented by the original as "today").

    Returns:
        The controller's result; per the original description a dictionary
        containing total/used/unused.

    Raises:
        HTTPException: 400 when ``date`` is supplied but is not a valid date.
            Previously a malformed date was silently treated as "not supplied",
            so the caller received statistics for a different day than requested.
    """
    from datetime import date as _date

    # Local import: this module's top-level fastapi import does not include HTTPException.
    from fastapi import HTTPException

    target_day = None
    if date:
        try:
            year, month, day = map(int, date.split("-"))
            target_day = _date(year, month, day)
        except ValueError as exc:
            # Covers non-numeric parts, wrong number of parts, and out-of-range values.
            raise HTTPException(
                status_code=400,
                detail=f"Invalid date {date!r}; expected YYYY-MM-DD",
            ) from exc
    return await controller.get_stats(source, target_day)
@router.get("/overview", summary="获取所有平台统计概览", dependencies=[DependPermission])
async def get_overview(
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Return the statistics overview across all platforms (permission-guarded).

    Returns:
        Whatever ``KeywordController.get_overview_stats`` produces.
    """
    overview = await controller.get_overview_stats()
    return overview
@router.get("/list", summary="获取关键词列表", dependencies=[DependPermission])
async def list_keywords(
    source: str = Query(..., pattern="^(boss|qcwy|zhilian)$"),
    page: int = 1,
    page_size: int = 20,
    city: str | None = None,
    job: str | None = None,
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """List keywords for a platform (permission-guarded).

    Args:
        source: Platform identifier, validated against boss|qcwy|zhilian.
        page: Page number.
        page_size: Items per page.
        city: Optional city filter.
        job: Optional job filter.

    Returns:
        The controller's list result.
    """
    listing = await controller.list_keywords(source, page, page_size, city, job)
    return listing
@router.post("/create", summary="创建关键词", dependencies=[DependPermission])
async def create_keyword(
    item: KeywordCreate,
    source: str = Query(..., pattern="^(boss|qcwy|zhilian)$"),
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Create a keyword for a platform (permission-guarded).

    Args:
        item: Keyword data from the request body.
        source: Platform identifier, validated against boss|qcwy|zhilian.

    Returns:
        The controller's creation result.
    """
    created = await controller.create_keyword(source, item)
    return created
@router.put("/update", summary="更新关键词", dependencies=[DependPermission])
async def update_keyword(
    id: int,
    item: KeywordUpdate,
    source: str = Query(..., pattern="^(boss|qcwy|zhilian)$"),
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Update a keyword record (permission-guarded).

    Args:
        id: Record ID.
        item: Update data from the request body.
        source: Platform identifier, validated against boss|qcwy|zhilian.

    Returns:
        The controller's update result.
    """
    record_id = id
    return await controller.update_keyword(source, record_id, item)
@router.delete("/delete", summary="删除关键词", dependencies=[DependPermission])
async def delete_keyword(
    id: int,
    source: str = Query(..., pattern="^(boss|qcwy|zhilian)$"),
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Delete a keyword record (permission-guarded).

    Args:
        id: Record ID.
        source: Platform identifier, validated against boss|qcwy|zhilian.

    Returns:
        The controller's deletion result.
    """
    record_id = id
    return await controller.delete_keyword(source, record_id)

View File

@ -0,0 +1,8 @@
from fastapi import APIRouter
from .menus import router
# Package-level router: mounts the menu endpoints under a single tag.
menus_router = APIRouter()
menus_router.include_router(router, tags=["菜单模块"])
# Only the aggregated router is exported from this package.
__all__ = ["menus_router"]

63
app/api/v1/menus/menus.py Normal file
View File

@ -0,0 +1,63 @@
import logging
from fastapi import APIRouter, Query
from app.controllers.menu import menu_controller
from app.schemas.base import Fail, Success, SuccessExtra
from app.schemas.menus import *
logger = logging.getLogger(__name__)
router = APIRouter()
@router.get("/list", summary="查看菜单列表")
async def list_menu(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
):
    """Return the full menu tree.

    Args:
        page: Echoed back in the response; no slicing is applied here.
        page_size: Echoed back in the response; no slicing is applied here.

    Returns:
        ``SuccessExtra`` whose ``data`` is the list of top-level menus (each with
        nested ``children``) and whose ``total`` is the number of top-level menus.
    """

    async def build_subtree(menu):
        # The menu object was already loaded by the parent's filter() call, so it
        # is serialised directly instead of being re-fetched by id (this removes
        # one query per node).
        node = await menu.to_dict()
        children = await menu_controller.model.filter(parent_id=menu.id).order_by("order")
        node["children"] = [await build_subtree(child) for child in children]
        return node

    # Top-level menus are those whose parent_id is 0.
    root_menus = await menu_controller.model.filter(parent_id=0).order_by("order")
    res_menu = [await build_subtree(menu) for menu in root_menus]
    return SuccessExtra(data=res_menu, total=len(res_menu), page=page, page_size=page_size)
@router.get("/get", summary="查看菜单")
async def get_menu(
    menu_id: int = Query(..., description="菜单id"),
):
    """Fetch a single menu and return it as a dictionary.

    Args:
        menu_id: Primary key of the menu.
    """
    menu_obj = await menu_controller.get(id=menu_id)
    # Serialise via to_dict(), as the other "get" endpoints and list_menu do,
    # rather than handing the raw model instance to the response.
    return Success(data=await menu_obj.to_dict())
@router.post("/create", summary="创建菜单")
async def create_menu(menu_in: MenuCreate):
    """Create a menu from the request body."""
    payload = menu_in
    await menu_controller.create(obj_in=payload)
    return Success(msg="Created Success")
@router.post("/update", summary="更新菜单")
async def update_menu(menu_in: MenuUpdate):
    """Update the menu whose id is carried in the request body."""
    target_id = menu_in.id
    await menu_controller.update(id=target_id, obj_in=menu_in)
    return Success(msg="Updated Success")
@router.delete("/delete", summary="删除菜单")
async def delete_menu(id: int = Query(..., description="菜单id")):
    """Delete a menu, refusing when it still has child menus.

    Returns:
        ``Fail`` when at least one menu has this menu as its parent,
        otherwise ``Success`` after the menu has been removed.
    """
    children = menu_controller.model.filter(parent_id=id)
    has_children = await children.count() > 0
    if not has_children:
        await menu_controller.remove(id=id)
        return Success(msg="Deleted Success")
    return Fail(msg="Cannot delete a menu with child menus")

18
app/api/v1/pipeline.py Normal file
View File

@ -0,0 +1,18 @@
from fastapi import APIRouter
from app.core.scheduler import ecs_full_pipeline_job
pipeline_router = APIRouter(tags=["ECS任务"])
@pipeline_router.get("/trigger", summary="立即触发ECS全流程任务无需鉴权")
async def trigger_ecs_pipeline():
    """
    Manually run the ECS full-pipeline job once.

    Per the original description the job deletes, creates, installs the cloud
    assistant and executes commands; those steps live in ``ecs_full_pipeline_job``.

    Takes no parameters. The job is awaited before the response is returned.
    Returns a dictionary with a status code and message.
    Purpose: a manual trigger for use from a console or an external system.
    """
    await ecs_full_pipeline_job()
    response = {"code": 200, "message": "ECS 全流程任务已触发执行"}
    return response

View File

@ -0,0 +1,10 @@
from fastapi import APIRouter
from .proxy import proxy_router as proxy_sub_router
# Package-level router: mounts the proxy management endpoints under a single tag.
proxy_router = APIRouter()
proxy_router.include_router(proxy_sub_router, tags=["代理IP管理"])
# Only the aggregated router is exported from this package.
__all__ = ["proxy_router"]

174
app/api/v1/proxy/proxy.py Normal file
View File

@ -0,0 +1,174 @@
from typing import Optional, Dict, Any
from fastapi import APIRouter, Query, HTTPException
from tortoise.transactions import in_transaction
from app.models.cleaning import ProxyConfig, ProxyProvider
from app.schemas.base import Success, SuccessExtra
proxy_router = APIRouter()
@proxy_router.get("/configs")
async def list_proxy_configs(
    name: Optional[str] = Query(None, description="名称"),
    platform: Optional[str] = Query(None, description="平台标识"),
    proxy_type: Optional[str] = Query(None, description="代理类型: http/socks/tunnel"),
    is_active: Optional[bool] = Query(None, description="是否启用"),
    page: int = Query(1, ge=1, description="页码"),
    page_size: int = Query(10, ge=1, le=200, description="每页数量"),
):
    """List proxy configurations, newest first, with optional filters.

    Args:
        name: Case-insensitive substring match on the name.
        platform: Exact match on the platform identifier.
        proxy_type: Exact match on the proxy type.
        is_active: Exact match on the active flag; ``None`` means no filter.
        page: 1-based page number.
        page_size: Items per page (1-200).

    Returns:
        ``SuccessExtra`` with the page of rows and the total match count.
    """
    query = ProxyConfig.all()
    if name:
        query = query.filter(name__icontains=name)
    # Exact-match filters are applied only when a non-empty value was supplied.
    for column, wanted in (("platform", platform), ("proxy_type", proxy_type)):
        if wanted:
            query = query.filter(**{column: wanted})
    if is_active is not None:
        query = query.filter(is_active=is_active)

    total = await query.count()
    skip = (page - 1) * page_size
    rows = await query.order_by("-id").offset(skip).limit(page_size)

    exposed = (
        "id",
        "name",
        "proxy_type",
        "platform",
        "proxy_url",
        "is_active",
        "created_at",
        "updated_at",
    )
    data = [{column: getattr(row, column) for column in exposed} for row in rows]
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
@proxy_router.post("/configs")
async def create_proxy_config(payload: Dict[str, Any]):
    """Create a proxy configuration inside a transaction.

    Args:
        payload: JSON body; reads ``name``, ``proxy_type``, ``platform``
            (default ``"all"``), ``proxy_url`` and ``is_active`` (default true).

    Returns:
        ``Success`` carrying the new row's id.

    Raises:
        HTTPException: 400 with the exception text when creation fails.
    """
    try:
        values = {
            "name": payload.get("name"),
            "proxy_type": payload.get("proxy_type"),
            "platform": payload.get("platform", "all"),
            "proxy_url": payload.get("proxy_url"),
            "is_active": bool(payload.get("is_active", True)),
        }
        async with in_transaction():
            created = await ProxyConfig.create(**values)
        return Success(data={"id": created.id})
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc))
@proxy_router.put("/configs/{config_id}")
async def update_proxy_config(config_id: int, payload: Dict[str, Any]):
    """Partially update a proxy configuration.

    Only the whitelisted fields present in ``payload`` are written.

    Raises:
        HTTPException: 404 when no configuration has this id.
    """
    config = await ProxyConfig.get_or_none(id=config_id)
    if config is None:
        raise HTTPException(status_code=404, detail="Proxy config not found")

    editable = ("name", "proxy_type", "platform", "proxy_url", "is_active")
    for field in (f for f in editable if f in payload):
        setattr(config, field, payload[field])
    await config.save()
    return Success(data={"id": config.id})
@proxy_router.delete("/configs/{config_id}")
async def delete_proxy_config(config_id: int):
    """Delete a proxy configuration.

    Raises:
        HTTPException: 404 when no configuration has this id.
    """
    config = await ProxyConfig.get_or_none(id=config_id)
    if config is None:
        raise HTTPException(status_code=404, detail="Proxy config not found")
    await config.delete()
    return Success(data={"id": config_id})
@proxy_router.get("/providers")
async def list_proxy_providers(
    name: Optional[str] = Query(None, description="名称"),
    platform: Optional[str] = Query(None, description="平台标识"),
    mode: Optional[str] = Query(None, description="解析模式"),
    page: int = Query(1, ge=1, description="页码"),
    page_size: int = Query(10, ge=1, le=200, description="每页数量"),
):
    """List proxy providers, newest first, with optional filters.

    Args:
        name: Case-insensitive substring match on the name.
        platform: Exact match on the platform identifier.
        mode: Exact match on the parsing mode.
        page: 1-based page number.
        page_size: Items per page (1-200).

    Returns:
        ``SuccessExtra`` with the page of rows and the total match count.
    """
    query = ProxyProvider.all()
    if name:
        query = query.filter(name__icontains=name)
    # Exact-match filters are applied only when a non-empty value was supplied.
    for column, wanted in (("platform", platform), ("mode", mode)):
        if wanted:
            query = query.filter(**{column: wanted})

    total = await query.count()
    skip = (page - 1) * page_size
    rows = await query.order_by("-id").offset(skip).limit(page_size)

    exposed = (
        "id",
        "name",
        "platform",
        "mode",
        "list_path",
        "ip_path",
        "port_path",
        "username_path",
        "password_path",
        "pattern",
        "template",
        "created_at",
        "updated_at",
    )
    data = [{column: getattr(row, column) for column in exposed} for row in rows]
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
@proxy_router.post("/providers")
async def create_proxy_provider(payload: Dict[str, Any]):
    """Create a proxy provider inside a transaction.

    Args:
        payload: JSON body; ``platform`` defaults to ``"all"`` and ``mode`` to
            ``"json"``. Every other field is read with ``payload.get`` and is
            ``None`` when absent.

    Returns:
        ``Success`` carrying the new row's id.

    Raises:
        HTTPException: 400 with the exception text when creation fails.
    """
    try:
        values = {
            "name": payload.get("name"),
            "platform": payload.get("platform", "all"),
            "mode": payload.get("mode", "json"),
        }
        # The remaining fields share the same "missing -> None" rule.
        for key in (
            "list_path",
            "ip_path",
            "port_path",
            "username_path",
            "password_path",
            "pattern",
            "template",
        ):
            values[key] = payload.get(key)
        async with in_transaction():
            created = await ProxyProvider.create(**values)
        return Success(data={"id": created.id})
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc))
@proxy_router.put("/providers/{provider_id}")
async def update_proxy_provider(provider_id: int, payload: Dict[str, Any]):
    """Partially update a proxy provider.

    Only the whitelisted fields present in ``payload`` are written.

    Raises:
        HTTPException: 404 when no provider has this id.
    """
    provider = await ProxyProvider.get_or_none(id=provider_id)
    if provider is None:
        raise HTTPException(status_code=404, detail="Proxy provider not found")

    editable = (
        "name",
        "platform",
        "mode",
        "list_path",
        "ip_path",
        "port_path",
        "username_path",
        "password_path",
        "pattern",
        "template",
    )
    for field in (f for f in editable if f in payload):
        setattr(provider, field, payload[field])
    await provider.save()
    return Success(data={"id": provider.id})
@proxy_router.delete("/providers/{provider_id}")
async def delete_proxy_provider(provider_id: int):
    """Delete a proxy provider.

    Raises:
        HTTPException: 404 when no provider has this id.
    """
    provider = await ProxyProvider.get_or_none(id=provider_id)
    if provider is None:
        raise HTTPException(status_code=404, detail="Proxy provider not found")
    await provider.delete()
    return Success(data={"id": provider_id})

View File

@ -0,0 +1,8 @@
from fastapi import APIRouter
from .roles import router
# Package-level router: mounts the role endpoints under a single tag.
roles_router = APIRouter()
roles_router.include_router(router, tags=["角色模块"])
# Only the aggregated router is exported from this package.
__all__ = ["roles_router"]

73
app/api/v1/roles/roles.py Normal file
View File

@ -0,0 +1,73 @@
import logging
from fastapi import APIRouter, Query
from fastapi.exceptions import HTTPException
from tortoise.expressions import Q
from app.controllers import role_controller
from app.schemas.base import Success, SuccessExtra
from app.schemas.roles import *
logger = logging.getLogger(__name__)
router = APIRouter()
@router.get("/list", summary="查看角色列表")
async def list_role(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
    role_name: str = Query("", description="角色名称,用于查询"),
):
    """List roles page by page, optionally filtered by a name substring.

    Returns:
        ``SuccessExtra`` with the serialised roles and the total count.
    """
    search = Q(name__contains=role_name) if role_name else Q()
    total, roles = await role_controller.list(page=page, page_size=page_size, search=search)
    data = []
    for role in roles:
        data.append(await role.to_dict())
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
@router.get("/get", summary="查看角色")
async def get_role(role_id: int = Query(..., description="角色ID")):
    """Fetch a single role and return it as a dictionary."""
    role = await role_controller.get(id=role_id)
    role_dict = await role.to_dict()
    return Success(data=role_dict)
@router.post("/create", summary="创建角色")
async def create_role(role_in: RoleCreate):
    """Create a role, rejecting duplicate names.

    Raises:
        HTTPException: 400 when a role with the same name already exists.
    """
    name_taken = await role_controller.is_exist(name=role_in.name)
    if name_taken:
        raise HTTPException(
            status_code=400,
            detail="The role with this rolename already exists in the system.",
        )
    await role_controller.create(obj_in=role_in)
    return Success(msg="Created Successfully")
@router.post("/update", summary="更新角色")
async def update_role(role_in: RoleUpdate):
    """Update the role whose id is carried in the request body."""
    target_id = role_in.id
    await role_controller.update(id=target_id, obj_in=role_in)
    return Success(msg="Updated Successfully")
@router.delete("/delete", summary="删除角色")
async def delete_role(role_id: int = Query(..., description="角色ID")):
    """Delete the role identified by ``role_id``."""
    target_id = role_id
    await role_controller.remove(id=target_id)
    return Success(msg="Deleted Success")
@router.get("/authorized", summary="查看角色权限")
async def get_role_authorized(id: int = Query(..., description="角色ID")):
    """Return a role serialised with ``m2m=True`` (many-to-many relations requested)."""
    role = await role_controller.get(id=id)
    return Success(data=await role.to_dict(m2m=True))
@router.post("/authorized", summary="更新角色权限")
async def update_role_authorized(role_in: RoleUpdateMenusApis):
    """Pass the body's menu ids and API infos to ``role_controller.update_roles`` for the role."""
    role = await role_controller.get(id=role_in.id)
    await role_controller.update_roles(
        role=role,
        menu_ids=role_in.menu_ids,
        api_infos=role_in.api_infos,
    )
    return Success(msg="Updated Successfully")

13
app/api/v1/stats.py Normal file
View File

@ -0,0 +1,13 @@
from fastapi import APIRouter
from app.core.scheduler import stats_job
stats_router = APIRouter(tags=["统计任务"])
@stats_router.get("/trigger", summary="触发统计并上报(无需鉴权)")
async def trigger_stats():
    """Run the statistics job once; per the original description it also reports and sends e-mail."""
    await stats_job()
    response = {"code": 200, "message": "统计任务已执行并尝试上报"}
    return response

View File

@ -0,0 +1,9 @@
from fastapi import APIRouter
from .token import token_router as token_sub_router
# Package-level router: mounts the token management endpoints under a single tag.
token_router = APIRouter()
token_router.include_router(token_sub_router, tags=["Token管理"])
# Only the aggregated router is exported from this package.
__all__ = ["token_router"]

187
app/api/v1/token/token.py Normal file
View File

@ -0,0 +1,187 @@
import logging
import time
from typing import Any, Dict, Tuple
from fastapi import APIRouter, Query, Body, Path, BackgroundTasks
from fastapi.background import P
from tortoise.expressions import Q
from app.controllers.token import token_controller
from app.schemas.base import Fail, Success, SuccessExtra
from app.schemas.token import BossTokenUpdate,BossTokenCreate
logger = logging.getLogger(__name__)
token_router = APIRouter()
# Simple in-memory cache: the key is the combination of query parameters,
# the value is (cache timestamp, response data).
_TOKENS_CACHE: Dict[Tuple[Any, Any, int, int], Tuple[float, Dict[str, Any]]] = {}
# Lifetime of a cache entry, in seconds.
_CACHE_TTL_SECONDS: int =60
@token_router.get("/tokens", summary="获取Boss Token列表")
async def list_boss_tokens(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
    status: int = Query(None, description="状态筛选"),
):
    """List Boss tokens page by page, optionally filtered by status.

    Returns:
        ``SuccessExtra`` with the serialised tokens and the total count.
    """
    # Q is already imported at module level, so no local import is needed.
    search = Q()
    if status is not None:
        search = search & Q(status=status)
    total, tokens = await token_controller.get_tokens(page=page, page_size=page_size, search=search)
    data = []
    for token in tokens:
        data.append(await token.to_dict())
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
@token_router.get("/tokens/{token_id}", summary="获取Boss Token详情")
async def get_boss_token(token_id: int = Path(..., description="Token ID")):
    """Return the details of one Boss token as a dictionary."""
    token = await token_controller.get_token(token_id)
    return Success(data=await token.to_dict())
@token_router.post("/tokens", summary="创建Boss Token")
async def create_boss_token(token_data: BossTokenCreate = Body(..., description="Token数据")):
    """Create a Boss token, then empty the list cache so the new row is visible at once."""
    await token_controller.create_token(token_data)
    _TOKENS_CACHE.clear()
    return Success(msg="创建成功")
@token_router.put("/tokens/{token_id}", summary="更新Boss Token")
async def update_boss_token(
    token_id: int = Path(..., description="Token ID"),
    token_data: BossTokenUpdate = Body(..., description="Token数据"),
):
    """Update a Boss token, then empty the list cache so the change is visible at once."""
    await token_controller.update_token(token_id, token_data)
    _TOKENS_CACHE.clear()
    return Success(msg="更新成功")
@token_router.delete("/tokens/{token_id}", summary="删除Boss Token")
async def delete_boss_token(token_id: int = Path(..., description="Token ID")):
    """Delete a Boss token, then empty the list cache so the removal is visible at once."""
    await token_controller.delete_token(token_id)
    _TOKENS_CACHE.clear()
    return Success(msg="删除成功")
@token_router.post("/tokens/cache/clear", summary="强制清除Token缓存")
async def clear_token_cache():
    """Empty the token list cache and report how many entries were dropped."""
    # The dict is mutated in place, never rebound, so no ``global`` statement is required.
    cache_size = len(_TOKENS_CACHE)
    _TOKENS_CACHE.clear()
    logger.info(f"手动清除Token缓存清除了 {cache_size} 条缓存数据")
    return Success(msg=f"成功清除 {cache_size} 条Token缓存")
# NOTE(review): this looks like a second module pasted below the first one.
from typing import Optional, Dict, Any
from fastapi import APIRouter, Query, HTTPException
from tortoise.transactions import in_transaction
from app.models.token import BossToken
from app.schemas.base import Success
# NOTE(review): rebinding ``token_router`` here means every route registered
# above this line was attached to the previous router object, which the name no
# longer refers to. Importers of ``token_router`` therefore only see the routes
# registered below. Confirm which set of handlers is intended before removing
# this line, since both sets define GET/POST/PUT/DELETE on the same paths.
token_router = APIRouter()
@token_router.get("/tokens")
async def list_tokens(
    wt2: Optional[str] = Query(None),
    mpt: Optional[str] = Query(None),
    page: int = Query(1, ge=1),
    page_size: int = Query(10, ge=1, le=200),
):
    """List BossToken rows, newest first, with a short-lived in-memory cache.

    Responses are cached per (wt2, mpt, page, page_size) for
    ``_CACHE_TTL_SECONDS`` seconds.

    Args:
        wt2 (Optional[str]): Case-insensitive substring match on ``wt2``.
        mpt (Optional[str]): Case-insensitive substring match on ``mpt``.
        page (int): 1-based page number.
        page_size (int): Items per page (1-200).

    Returns:
        Dict[str, Any]: Response dictionary containing ``code``, ``data`` and ``total``.
    """
    cache_key: Tuple[Any, Any, int, int] = (wt2, mpt, page, page_size)
    now = time.monotonic()
    cached = _TOKENS_CACHE.get(cache_key)
    if cached and (now - cached[0] < _CACHE_TTL_SECONDS):
        return cached[1]

    # Cache miss: drop every expired entry first. Keys are built from
    # caller-supplied filter strings, so without eviction the cache would grow
    # until one of the write endpoints happens to clear it.
    expired_keys = [
        key for key, (stored_at, _) in _TOKENS_CACHE.items()
        if now - stored_at >= _CACHE_TTL_SECONDS
    ]
    for key in expired_keys:
        _TOKENS_CACHE.pop(key, None)

    qs = BossToken.all()
    if wt2:
        qs = qs.filter(wt2__icontains=wt2)
    if mpt:
        qs = qs.filter(mpt__icontains=mpt)
    total = await qs.count()
    items = await qs.order_by("-id").offset((page - 1) * page_size).limit(page_size)
    data = [
        {
            "id": item.id,
            "wt2": item.wt2,
            "mpt": item.mpt,
            "is_active": item.is_active,
            "failed_count": item.failed_count,
            "last_used_time": item.last_used_time,
            "created_at": item.created_at,
        }
        for item in items
    ]
    resp: Dict[str, Any] = {"code": 200, "data": data, "total": total}
    # The timestamp is the one taken before the queries ran, as in the original.
    _TOKENS_CACHE[cache_key] = (now, resp)
    return resp
@token_router.post("/tokens")
async def create_token(payload: Dict[str, Any]):
    """Create a BossToken inside a transaction and empty the list cache.

    Args:
        payload: JSON body; reads ``wt2``, ``mpt``, ``is_active`` (default true),
            ``failed_count`` (default 0) and ``last_used_time``.

    Returns:
        ``Success`` carrying the new row's id.

    Raises:
        HTTPException: 400 with the exception text when anything in the creation fails.
    """
    try:
        async with in_transaction():
            created = await BossToken.create(
                wt2=payload.get("wt2"),
                mpt=payload.get("mpt"),
                is_active=bool(payload.get("is_active", True)),
                failed_count=int(payload.get("failed_count", 0)),
                last_used_time=payload.get("last_used_time"),
            )
        _TOKENS_CACHE.clear()
        return Success(data={"id": created.id})
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
@token_router.put("/tokens/{id}")
async def update_token(id: int, payload: Dict[str, Any]):
    """Partially update a BossToken and empty the list cache.

    Only the whitelisted fields present in ``payload`` are written.

    Raises:
        HTTPException: 404 when no token has this id.
    """
    token = await BossToken.get_or_none(id=id)
    if token is None:
        raise HTTPException(status_code=404, detail="Token not found")

    editable = ("wt2", "mpt", "is_active", "failed_count", "last_used_time")
    for field in (f for f in editable if f in payload):
        setattr(token, field, payload[field])
    await token.save()
    _TOKENS_CACHE.clear()
    return Success(data={"id": token.id})
@token_router.delete("/tokens/{token_id}")
async def delete_token(token_id: int):
    """Delete a BossToken and empty the list cache.

    Raises:
        HTTPException: 404 when no token has this id.
    """
    token = await BossToken.get_or_none(id=token_id)
    if token is None:
        raise HTTPException(status_code=404, detail="Token not found")
    await token.delete()
    _TOKENS_CACHE.clear()
    return Success(data={"id": token_id})

View File

@ -0,0 +1,8 @@
from fastapi import APIRouter
from .users import router
# Package-level router: mounts the user endpoints under a single tag.
users_router = APIRouter()
users_router.include_router(router, tags=["用户模块"])
# Only the aggregated router is exported from this package.
__all__ = ["users_router"]

81
app/api/v1/users/users.py Normal file
View File

@ -0,0 +1,81 @@
import logging
from fastapi import APIRouter, Body, Query
from tortoise.expressions import Q
from app.controllers.dept import dept_controller
from app.controllers.user import user_controller
from app.schemas.base import Fail, Success, SuccessExtra
from app.schemas.users import *
logger = logging.getLogger(__name__)
router = APIRouter()
@router.get("/list", summary="查看用户列表")
async def list_user(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
    username: str = Query("", description="用户名称,用于搜索"),
    email: str = Query("", description="邮箱地址"),
    dept_id: int = Query(None, description="部门ID"),
):
    """List users page by page with optional filters.

    Args:
        page: Page number.
        page_size: Items per page.
        username: Substring filter on the username.
        email: Substring filter on the e-mail address.
        dept_id: Exact department filter; ``None`` means no filter.

    Returns:
        ``SuccessExtra`` with the users (password excluded). Each user's
        ``dept_id`` is replaced by a ``dept`` dictionary, which is empty when the
        user has no department.
    """
    q = Q()
    if username:
        q &= Q(username__contains=username)
    if email:
        q &= Q(email__contains=email)
    if dept_id is not None:
        q &= Q(dept_id=dept_id)
    total, user_objs = await user_controller.list(page=page, page_size=page_size, search=q)
    data = [await obj.to_dict(m2m=True, exclude_fields=["password"]) for obj in user_objs]

    # Users on one page often share departments; load each department once per
    # request instead of once per user row.
    dept_cache: dict = {}
    for item in data:
        # A separate local name, so the ``dept_id`` query parameter is not shadowed.
        user_dept_id = item.pop("dept_id", None)
        if not user_dept_id:
            item["dept"] = {}
            continue
        if user_dept_id not in dept_cache:
            dept_obj = await dept_controller.get(id=user_dept_id)
            dept_cache[user_dept_id] = await dept_obj.to_dict()
        # Copy so that rows never share one mutable dictionary.
        item["dept"] = dict(dept_cache[user_dept_id])
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
@router.get("/get", summary="查看用户")
async def get_user(user_id: int = Query(..., description="用户ID")):
    """Fetch a single user and return it as a dictionary without the password field."""
    user = await user_controller.get(id=user_id)
    return Success(data=await user.to_dict(exclude_fields=["password"]))
@router.post("/create", summary="创建用户")
async def create_user(user_in: UserCreate):
    """Create a user and assign roles, rejecting an e-mail that is already registered.

    Returns:
        ``Fail`` (code 400) when the e-mail is taken, otherwise ``Success``.
    """
    existing = await user_controller.get_by_email(user_in.email)
    if not existing:
        created = await user_controller.create_user(obj_in=user_in)
        await user_controller.update_roles(created, user_in.role_ids)
        return Success(msg="Created Successfully")
    return Fail(code=400, msg="The user with this email already exists in the system.")
@router.post("/update", summary="更新用户")
async def update_user(user_in: UserUpdate):
    """Update a user, then pass the body's role ids to ``user_controller.update_roles``."""
    updated = await user_controller.update(id=user_in.id, obj_in=user_in)
    await user_controller.update_roles(updated, user_in.role_ids)
    return Success(msg="Updated Successfully")
@router.delete("/delete", summary="删除用户")
async def delete_user(user_id: int = Query(..., description="用户ID")):
    """Delete the user identified by ``user_id``."""
    target_id = user_id
    await user_controller.remove(id=target_id)
    return Success(msg="Deleted Successfully")
@router.post("/reset_password", summary="重置密码")
async def reset_password(user_id: int = Body(..., description="用户ID", embed=True)):
    """Reset a user's password through the controller.

    Args:
        user_id: User ID, read from the JSON body key ``user_id``.
    """
    target_id = user_id
    await user_controller.reset_password(target_id)
    return Success(msg="密码已重置为123456")

View File

@ -0,0 +1,2 @@
from .role import role_controller as role_controller
from .user import user_controller as user_controller

45
app/controllers/api.py Normal file
View File

@ -0,0 +1,45 @@
from fastapi.routing import APIRoute
from app.core.crud import CRUDBase
from app.log import logger
from app.models.admin import Api
from app.schemas.apis import ApiCreate, ApiUpdate
class ApiController(CRUDBase[Api, ApiCreate, ApiUpdate]):
    """CRUD controller for ``Api`` rows, plus synchronisation with the route table."""

    def __init__(self):
        super().__init__(model=Api)

    async def refresh_api(self):
        """Synchronise the ``Api`` table with the application's guarded routes.

        A route is tracked only when it is an ``APIRoute`` with at least one
        dependency (the original comment: "only update APIs that have auth").
        Rows whose (method, path) no longer match a tracked route are deleted;
        every tracked route is then updated in place or created.
        """
        # Imported here, as in the original, rather than at module load time.
        from app import app

        # Scan the route table once; both phases below work from this list.
        guarded_routes = [
            (list(route.methods)[0], route.path_format, route)
            for route in app.routes
            if isinstance(route, APIRoute) and len(route.dependencies) > 0
        ]
        # Set membership keeps the stale-row check linear in the number of rows.
        live_keys = {(method, path) for method, path, _ in guarded_routes}

        # Remove rows for APIs that no longer exist.
        for api in await Api.all():
            if (api.method, api.path) not in live_keys:
                logger.debug(f"API Deleted {api.method} {api.path}")
                await Api.filter(method=api.method, path=api.path).delete()

        # Create or update a row for every tracked route.
        for method, path, route in guarded_routes:
            route_tags = list(route.tags)
            # A guarded route without tags used to raise IndexError here and
            # abort the whole refresh; fall back to an empty tag instead.
            tags = route_tags[0] if route_tags else ""
            fields = dict(method=method, path=path, summary=route.summary, tags=tags)
            api_obj = await Api.filter(method=method, path=path).first()
            if api_obj:
                await api_obj.update_from_dict(fields).save()
            else:
                logger.debug(f"API Created {method} {path}")
                await Api.create(**fields)


api_controller = ApiController()

View File

@ -0,0 +1,9 @@
from app.core.crud import CRUDBase
from app.models.cleaning import CleaningTask
from app.schemas.cleaning import CleaningTaskCreate, CleaningTaskUpdate
class CleaningController(CRUDBase[CleaningTask, CleaningTaskCreate, CleaningTaskUpdate]):
    """CRUD controller bound to the ``CleaningTask`` model."""

    def __init__(self) -> None:
        # Everything else is inherited from CRUDBase; only the model is supplied.
        super().__init__(model=CleaningTask)


# Module-level instance created at import time.
cleaning_controller = CleaningController()

86
app/controllers/dept.py Normal file
View File

@ -0,0 +1,86 @@
from tortoise.expressions import Q
from tortoise.transactions import atomic
from app.core.crud import CRUDBase
from app.models.admin import Dept, DeptClosure
from app.schemas.depts import DeptCreate, DeptUpdate
class DeptController(CRUDBase[Dept, DeptCreate, DeptUpdate]):
    """CRUD controller for departments, including the ``DeptClosure`` rows."""

    def __init__(self):
        super().__init__(model=Dept)

    async def get_dept_tree(self, name):
        """Return the non-deleted departments as a nested tree.

        Args:
            name: Optional substring. When truthy, only departments whose name
                contains it are loaded.

        Returns:
            A list of root nodes (``parent_id == 0``). Each node is a dict with
            the keys ``id``, ``name``, ``desc``, ``order``, ``parent_id`` and
            ``children`` (a list of nodes of the same shape). The tree is built
            from the roots down, so a loaded department whose parent was not
            loaded does not appear in the result.
        """
        q = Q(is_deleted=False)  # soft-deleted departments are excluded
        if name:
            q &= Q(name__contains=name)
        all_depts = await self.model.filter(q).order_by("order")

        # Group once by parent so each node finds its children in O(1)
        # instead of rescanning the whole list; sibling order is preserved.
        children_of: dict = {}
        for dept in all_depts:
            children_of.setdefault(dept.parent_id, []).append(dept)

        def build_tree(parent_id):
            return [
                {
                    "id": dept.id,
                    "name": dept.name,
                    "desc": dept.desc,
                    "order": dept.order,
                    "parent_id": dept.parent_id,
                    "children": build_tree(dept.id),  # recurse into sub-departments
                }
                for dept in children_of.get(parent_id, [])
            ]

        return build_tree(0)

    async def get_dept_info(self):
        """Placeholder; not implemented (returns ``None``)."""
        pass

    async def update_dept_closure(self, obj: Dept):
        """Insert the closure rows that link ``obj`` to its ancestors and to itself.

        For every closure row ending at ``obj.parent_id`` a row with the same
        ancestor, ``obj.id`` as descendant and ``level + 1`` is created, plus
        the self row (level 0).
        """
        parent_depts = await DeptClosure.filter(descendant=obj.parent_id)
        dept_closure_objs: list[DeptClosure] = [
            DeptClosure(ancestor=item.ancestor, descendant=obj.id, level=item.level + 1)
            for item in parent_depts
        ]
        dept_closure_objs.append(DeptClosure(ancestor=obj.id, descendant=obj.id, level=0))
        await DeptClosure.bulk_create(dept_closure_objs)

    @atomic()
    async def create_dept(self, obj_in: DeptCreate):
        """Create a department and its closure rows."""
        if obj_in.parent_id != 0:
            # Result unused: the lookup only serves as a check on the parent id.
            # NOTE(review): presumably self.get raises for an unknown id - confirm.
            await self.get(id=obj_in.parent_id)
        new_obj = await self.create(obj_in=obj_in)
        await self.update_dept_closure(new_obj)

    @atomic()
    async def update_dept(self, obj_in: DeptUpdate):
        """Apply ``obj_in`` to the department and refresh its closure rows if it moved."""
        dept_obj = await self.get(id=obj_in.id)
        parent_changed = dept_obj.parent_id != obj_in.parent_id

        # Apply the new field values first. The closure rows are derived from
        # dept_obj.parent_id, so rebuilding them before this point (as the code
        # used to) recreated the links to the *old* parent.
        dept_obj.update_from_dict(obj_in.model_dump(exclude_unset=True))
        await dept_obj.save()

        if parent_changed:
            await DeptClosure.filter(ancestor=dept_obj.id).delete()
            await DeptClosure.filter(descendant=dept_obj.id).delete()
            # NOTE(review): the first delete also removes the rows linking this
            # department to its descendants; only this department's own rows
            # are recreated below - confirm whether the subtree needs rebuilding.
            await self.update_dept_closure(dept_obj)

    @atomic()
    async def delete_dept(self, dept_id: int):
        """Soft-delete the department and remove the closure rows that end at it."""
        obj = await self.get(id=dept_id)
        obj.is_deleted = True
        await obj.save()
        await DeptClosure.filter(descendant=dept_id).delete()


# Module-level instance created at import time.
dept_controller = DeptController()

224
app/controllers/job.py Normal file
View File

@ -0,0 +1,224 @@
from typing import Dict, Any, List, Optional
from fastapi import HTTPException, BackgroundTasks
from app.services.job import DataRouterService, DataType, PlatformType
from app.log import logger
from pydantic import BaseModel, Field
class UniversalDataRequest(BaseModel):
    """通用数据存储请求模型"""

    # Request body for storing a single record. The docstring above is left
    # untouched because pydantic exposes it as the schema description.

    # Raw record to store.
    data: Dict[str, Any] = Field(..., description="要存储的数据")
    # Kind of record (job / company, per the description string).
    data_type: DataType = Field(..., description="数据类型 (job/company)")
    # Source platform (boss / qcwy / zhilian, per the description string).
    platform: PlatformType = Field(..., description="平台类型 (boss/qcwy/zhilian)")
    # Whether to check for duplicates; defaults to True.
    check_duplicate: bool = Field(default=True, description="是否检查重复数据")
class BatchDataRequest(BaseModel):
    """批量数据存储请求模型"""

    # Request body for storing several records at once. The docstring above is
    # left untouched because pydantic exposes it as the schema description.

    # Raw records to store.
    data_list: List[Dict[str, Any]] = Field(..., description="要存储的数据列表")
    # Kind of record (job / company, per the description string).
    data_type: DataType = Field(..., description="数据类型 (job/company)")
    # Source platform (boss / qcwy / zhilian, per the description string).
    platform: PlatformType = Field(..., description="平台类型 (boss/qcwy/zhilian)")
    # Whether to check for duplicates; defaults to True.
    check_duplicate: bool = Field(default=True, description="是否检查重复数据")
class UniversalDataController:
    """Entry point for storing and querying job/company data of every platform.

    All storage work is delegated to the injected ``DataRouterService``; this
    class shapes the response dictionaries and turns failures into HTTP 500.
    """

    # Display names used in log lines, keyed by ``PlatformType.value``.
    _PLATFORM_DISPLAY_NAMES = {"boss": "Boss直聘", "qcwy": "前程无忧", "zhilian": "智联招聘"}

    def __init__(self, data_router_service: DataRouterService):
        self.data_router_service = data_router_service

    @classmethod
    def _platform_display_name(cls, platform: PlatformType) -> str:
        """Return the log display name for ``platform`` (its raw value if unmapped)."""
        return cls._PLATFORM_DISPLAY_NAMES.get(platform.value, platform.value)

    @staticmethod
    def _server_error(log_prefix: str, detail_prefix: str, exc: Exception) -> HTTPException:
        """Log ``exc`` and build the HTTP 500 error that reports it to the client."""
        logger.error(f"{log_prefix}: {str(exc)}")
        return HTTPException(status_code=500, detail=f"{detail_prefix}: {str(exc)}")

    async def store_single_data(self, request: UniversalDataRequest) -> Dict[str, Any]:
        """Store one record and return the service result.

        Returns:
            A dict with ``code`` (200 when ``result["success"]`` is truthy,
            otherwise 400), ``message``, ``data``, ``platform``, ``data_type``.

        Raises:
            HTTPException: status 500 for any exception raised while storing.
        """
        try:
            result = await self.data_router_service.store_data(
                data=request.data,
                data_type=request.data_type,
                platform=request.platform,
                check_duplicate=request.check_duplicate,
            )
            return {
                "code": 200 if result["success"] else 400,
                "message": result["message"],
                "data": result,
                "platform": request.platform,
                "data_type": request.data_type,
            }
        except Exception as e:
            # `from e` keeps the original traceback attached to the HTTP error.
            raise self._server_error("存储单条数据失败", "数据存储失败", e) from e

    async def store_batch_data(self, request: BatchDataRequest) -> Dict[str, Any]:
        """Store a list of records and return the success/failed/duplicate counters.

        Raises:
            HTTPException: status 500 for any exception raised while storing.
        """
        try:
            result = await self.data_router_service.batch_store_data(
                data_list=request.data_list,
                data_type=request.data_type,
                platform=request.platform,
                check_duplicate=request.check_duplicate,
            )
            return {
                "code": 200,
                "message": f"批量处理完成: 成功 {result['success']} 条,失败 {result['failed']} 条,重复 {result['duplicate']}",
                "data": result,
                "platform": request.platform,
                "data_type": request.data_type,
            }
        except Exception as e:
            raise self._server_error("批量存储数据失败", "批量数据存储失败", e) from e

    async def store_single_data_async(self,
                                      background_tasks: BackgroundTasks,
                                      request: UniversalDataRequest) -> Dict[str, Any]:
        """Queue one record for background storage and answer with code 202.

        Raises:
            HTTPException: status 500 if the task cannot be queued.
        """
        try:
            background_tasks.add_task(self._async_store_single_data, request)
            return {
                "code": 202,
                "message": "数据已加入异步处理队列",
                "platform": request.platform,
                "data_type": request.data_type,
            }
        except Exception as e:
            raise self._server_error("异步存储单条数据失败", "异步数据存储失败", e) from e

    async def store_batch_data_async(self,
                                     background_tasks: BackgroundTasks,
                                     request: BatchDataRequest) -> Dict[str, Any]:
        """Queue a batch for background storage and answer with code 202.

        Raises:
            HTTPException: status 500 if the task cannot be queued.
        """
        try:
            platform_name = self._platform_display_name(request.platform)
            logger.info(f"📥 收到批量请求: [{platform_name}] {request.data_type.value} x{len(request.data_list)}")
            background_tasks.add_task(self._async_store_batch_data, request)
            return {
                "code": 202,
                "message": f"批量数据已加入异步处理队列,共 {len(request.data_list)}",
                "platform": request.platform,
                "data_type": request.data_type,
            }
        except Exception as e:
            raise self._server_error("异步批量存储数据失败", "异步批量数据存储失败", e) from e

    async def _async_store_single_data(self, request: UniversalDataRequest):
        """Background task: store one record; failures are logged, never raised."""
        try:
            result = await self.data_router_service.store_data(
                data=request.data,
                data_type=request.data_type,
                platform=request.platform,
                check_duplicate=request.check_duplicate,
            )
            if result["success"]:
                logger.info(f"异步存储 {request.platform} {request.data_type} 数据成功")
            else:
                logger.warning(f"异步存储 {request.platform} {request.data_type} 数据失败: {result['message']}")
        except Exception as e:
            logger.error(f"异步存储单条数据后台任务失败: {str(e)}")

    async def _async_store_batch_data(self, request: BatchDataRequest):
        """Background task: store a batch; failures are logged, never raised."""
        try:
            platform_name = self._platform_display_name(request.platform)
            result = await self.data_router_service.batch_store_data(
                data_list=request.data_list,
                data_type=request.data_type,
                platform=request.platform,
                check_duplicate=request.check_duplicate,
            )
            logger.info(f"✅ 批量处理完成: [{platform_name}] 成功 {result['success']} 条, 重复 {result['duplicate']} 条, 失败 {result['failed']}")
        except Exception as e:
            logger.error(f"异步批量存储数据后台任务失败: {str(e)}")

    async def query_data(self, platform: PlatformType, data_type: DataType,
                         page: int = 1, page_size: int = 20) -> Dict[str, Any]:
        """Return one page of stored records.

        ``page`` is 1-based; it is converted to ``offset = (page - 1) * page_size``.
        ``total`` in the response is the ``count`` value reported by the service.

        Raises:
            HTTPException: status 500 for any exception raised while querying.
        """
        try:
            logger.info(f"查询 {platform} {data_type} 数据,页码: {page}, 页大小: {page_size}")
            result = await self.data_router_service.query_json_data(
                platform=platform,
                data_type=data_type,
                limit=page_size,
                offset=(page - 1) * page_size,
            )
            return {
                "code": 200,
                "message": "查询数据成功",
                "data": {
                    "items": result.get("data", []),
                    "total": result.get("count", 0),
                    "page": page,
                    "page_size": page_size,
                },
                "platform": platform,
                "data_type": data_type,
            }
        except Exception as e:
            raise self._server_error("查询数据失败", "查询数据失败", e) from e

    async def get_supported_platforms(self) -> Dict[str, Any]:
        """List the platform and data-type enum values plus the duplicate keys.

        The ``platform_duplicate_keys`` table is a static description written
        out here; it is not read from the storage layer.
        """
        return {
            "code": 200,
            "message": "获取支持的平台和数据类型成功",
            "data": {
                "platforms": [platform.value for platform in PlatformType],
                "data_types": [data_type.value for data_type in DataType],
                "platform_duplicate_keys": {
                    "boss": {
                        "job": "job_id",
                        "company": "company_name",
                    },
                    "qcwy": {
                        "job": "job_id + update_date_time",
                        "company": "company_name",
                    },
                    "zhilian": {
                        "job": "number + first_publish_time",
                        "company": "company_name",
                    },
                },
            },
        }
# Factory: build a controller around the given routing service.
def create_universal_data_controller(data_router_service: DataRouterService) -> UniversalDataController:
    controller = UniversalDataController(data_router_service)
    return controller

332
app/controllers/keyword.py Normal file
View File

@ -0,0 +1,332 @@
from datetime import date, datetime
import random
from typing import Any, Dict, List, Type
from tortoise.expressions import Q
from app.core.crud import CRUDBase
from app.models.keyword import BossKeyword, QcwyKeyword, ZhilianKeyword
class KeywordController:
def __init__(self) -> None:
self._model_map: Dict[str, Type] = {
"boss": BossKeyword,
"qcwy": QcwyKeyword,
"zhilian": ZhilianKeyword,
}
async def get_available(self, source: str, limit: int = 1, reserve: bool = True) -> Dict[str, Any]:
"""获取当天未使用的检索条件(城市+岗位)
参数:
source: 平台标识取值为 boss|qcwy|zhilian
limit: 返回数量上限
reserve: 是否立即标记为已使用
返回:
包含 items/total/limit 的字典结构
注意使用原子操作避免并发时的竞态条件
"""
model = self._ensure_model(source)
today = date.today()
now = datetime.now()
# 先统计总数
search = Q(last_requested_date__not=today) | Q(last_requested_date=None)
total = await model.filter(search).count()
items = []
if total > 0 and reserve:
# 使用原子操作:先更新,再查询已更新的记录
# 这样可以避免查询和标记之间的竞态条件
take = max(1, min(limit, total))
try:
# 获取一批未使用的记录ID随机选择
candidate_records = await model.filter(search).offset(
random.randint(0, max(0, total - take))
).limit(take).only('id')
candidate_ids = [r.id for r in candidate_records]
if candidate_ids:
# 原子性地更新这些记录(只更新未使用的)
# 使用数据库的原子UPDATE操作
updated_count = await model.filter(
id__in=candidate_ids
).filter(
Q(last_requested_date__isnull=True) | Q(last_requested_date__not=today)
).update(
last_requested_date=today,
last_requested_at=now
)
# 查询成功更新的记录
if updated_count > 0:
records = await model.filter(
id__in=candidate_ids,
last_requested_date=today
).limit(updated_count)
items = [{"id": r.id, "city": r.city, "job": r.job} for r in records]
except Exception as e:
# 如果原子操作失败,回退到原来的方法
import logging
logging.warning(f"原子操作失败,回退到原方法: {e}")
take = max(1, min(limit, total))
start = 0 if total == take else random.randint(0, total - take)
records = await model.filter(search).offset(start).limit(take)
items = [{"id": r.id, "city": r.city, "job": r.job} for r in records]
if reserve:
ids = [r.id for r in records]
await self.mark_used(source, ids)
elif total > 0:
# 如果不需要reserve直接查询
take = max(1, min(limit, total))
start = 0 if total == take else random.randint(0, total - take)
records = await model.filter(search).offset(start).limit(take)
items = [{"id": r.id, "city": r.city, "job": r.job} for r in records]
return {
"code": 200,
"message": "查询可用检索条件成功",
"data": {
"items": items,
"total": total,
"limit": limit,
},
}
async def get_stats(self, source: str, on_date: date | None = None) -> Dict[str, Any]:
"""统计指定平台在某日期的使用与未使用数量
参数:
source: 平台标识取值为 boss|qcwy|zhilian
on_date: 统计日期不传则为今天
返回:
包含 total/used/unused 的字典结构
"""
model = self._ensure_model(source)
d = on_date or date.today()
total = await model.all().count()
used = await model.filter(last_requested_date=d).count()
unused = max(0, total - used)
return {
"code": 200,
"message": "统计成功",
"data": {
"date": str(d),
"total": total,
"used": used,
"unused": unused,
},
}
async def mark_used(self, source: str, ids: List[int]) -> Dict[str, Any]:
"""将检索条件标记为今日已使用
参数:
source: 平台标识取值为 boss|qcwy|zhilian
ids: 需要标记的记录主键ID列表
返回:
更新结果包括成功条数与日期
"""
model = self._ensure_model(source)
updated = 0
now = datetime.now()
today = date.today()
for rid in ids:
obj = await model.filter(id=rid).first()
if obj is None:
continue
if obj.last_requested_date == today:
continue
obj.last_requested_date = today
obj.last_requested_at = now
await obj.save()
updated += 1
return {
"code": 200,
"message": "状态更新完成",
"data": {
"updated": updated,
"ids": ids,
"date": str(today),
},
}
async def list_keywords(
self,
source: str,
page: int = 1,
page_size: int = 20,
city: str | None = None,
job: str | None = None,
) -> Dict[str, Any]:
"""获取关键词列表
参数:
source: 平台标识
page: 页码
page_size: 每页数量
city: 城市过滤
job: 职位过滤
返回:
包含列表数据和分页信息的字典
"""
model = self._ensure_model(source)
queryset = model.all()
if city:
queryset = queryset.filter(city__icontains=city)
if job:
queryset = queryset.filter(job__icontains=job)
total = await queryset.count()
queryset = queryset.order_by("-id").offset((page - 1) * page_size).limit(page_size)
items = await queryset.values(
"id",
"city",
"job",
"last_requested_date",
"last_requested_at",
"created_at",
"updated_at",
)
return {
"code": 200,
"message": "获取成功",
"data": items,
"total": total,
"page": page,
"page_size": page_size,
}
async def create_keyword(self, source: str, obj_in: Any) -> Dict[str, Any]:
"""创建关键词
参数:
source: 平台标识
obj_in: 创建数据对象
返回:
创建结果
"""
model = self._ensure_model(source)
# Check if already exists
exists = await model.filter(city=obj_in.city, job=obj_in.job).exists()
if exists:
return {"code": 400, "message": "该关键词组合已存在"}
obj = await model.create(**obj_in.model_dump())
data = {
"id": obj.id,
"city": obj.city,
"job": obj.job,
"last_requested_date": obj.last_requested_date,
"last_requested_at": obj.last_requested_at,
"created_at": obj.created_at,
"updated_at": obj.updated_at,
}
return {"code": 200, "message": "创建成功", "data": data}
async def update_keyword(self, source: str, id: int, obj_in: Any) -> Dict[str, Any]:
"""更新关键词
参数:
source: 平台标识
id: 记录ID
obj_in: 更新数据对象
返回:
更新结果
"""
model = self._ensure_model(source)
obj = await model.filter(id=id).first()
if not obj:
return {"code": 404, "message": "记录不存在"}
update_data = obj_in.model_dump(exclude_unset=True)
if update_data:
# Check for duplicates if updating city or job
if "city" in update_data or "job" in update_data:
city = update_data.get("city", obj.city)
job = update_data.get("job", obj.job)
exists = await model.filter(city=city, job=job).exclude(id=id).exists()
if exists:
return {"code": 400, "message": "该关键词组合已存在"}
await obj.update_from_dict(update_data)
await obj.save()
data = {
"id": obj.id,
"city": obj.city,
"job": obj.job,
"last_requested_date": obj.last_requested_date,
"last_requested_at": obj.last_requested_at,
"created_at": obj.created_at,
"updated_at": obj.updated_at,
}
return {"code": 200, "message": "更新成功", "data": data}
async def delete_keyword(self, source: str, id: int) -> Dict[str, Any]:
"""删除关键词
参数:
source: 平台标识
id: 记录ID
返回:
删除结果
"""
model = self._ensure_model(source)
obj = await model.filter(id=id).first()
if not obj:
return {"code": 404, "message": "记录不存在"}
await obj.delete()
return {
"code": 200,
"message": "删除成功",
}
async def get_overview_stats(self) -> Dict[str, Any]:
"""获取所有平台的统计概览
返回:
包含各平台统计数据的字典
"""
today = date.today()
stats = {}
for source, model in self._model_map.items():
total = await model.all().count()
used = await model.filter(last_requested_date=today).count()
stats[source] = {
"total": total,
"used": used,
"unused": max(0, total - used),
}
return {
"code": 200,
"message": "获取概览统计成功",
"data": stats,
}
def _ensure_model(self, source: str) -> Type:
"""根据平台标识返回对应模型类型
参数:
source: 平台标识取值为 boss|qcwy|zhilian
返回:
对应的 Tortoise ORM 模型类型
"""
model = self._model_map.get(source)
if not model:
raise ValueError("不支持的平台标识")
return model

16
app/controllers/menu.py Normal file
View File

@ -0,0 +1,16 @@
from typing import Optional
from app.core.crud import CRUDBase
from app.models.admin import Menu
from app.schemas.menus import MenuCreate, MenuUpdate
class MenuController(CRUDBase[Menu, MenuCreate, MenuUpdate]):
    """CRUD controller bound to the ``Menu`` model."""

    def __init__(self) -> None:
        super().__init__(model=Menu)

    async def get_by_menu_path(self, path: str) -> Optional["Menu"]:
        """Return the first menu whose ``path`` equals the given value, or ``None``."""
        matching = self.model.filter(path=path)
        return await matching.first()


# Module-level instance created at import time.
menu_controller = MenuController()

20
app/controllers/proxy.py Normal file
View File

@ -0,0 +1,20 @@
from app.core.crud import CRUDBase
from app.models.cleaning import ProxyConfig, ProxyProvider
from app.schemas.proxy import ProxyConfigCreate, ProxyConfigUpdate
from app.schemas.proxy_provider import ProxyProviderCreate, ProxyProviderUpdate
class ProxyController(CRUDBase[ProxyConfig, ProxyConfigCreate, ProxyConfigUpdate]):
    """CRUD controller bound to the ``ProxyConfig`` model."""

    def __init__(self) -> None:
        # Everything else is inherited from CRUDBase; only the model is supplied.
        super().__init__(model=ProxyConfig)


# Module-level instance created at import time.
proxy_controller = ProxyController()
class ProxyProviderController(CRUDBase[ProxyProvider, ProxyProviderCreate, ProxyProviderUpdate]):
    """CRUD controller bound to the ``ProxyProvider`` model."""

    def __init__(self) -> None:
        # Everything else is inherited from CRUDBase; only the model is supplied.
        super().__init__(model=ProxyProvider)


# Module-level instance created at import time.
proxy_provider_controller = ProxyProviderController()

27
app/controllers/role.py Normal file
View File

@ -0,0 +1,27 @@
from typing import List
from app.core.crud import CRUDBase
from app.models.admin import Api, Menu, Role
from app.schemas.roles import RoleCreate, RoleUpdate
class RoleController(CRUDBase[Role, RoleCreate, RoleUpdate]):
    """CRUD controller for roles and their menu / API assignments."""

    def __init__(self):
        super().__init__(model=Role)

    async def is_exist(self, name: str) -> bool:
        """Return True when a role with this exact name exists."""
        return await self.model.filter(name=name).exists()

    async def update_roles(self, role: Role, menu_ids: List[int], api_infos: List[dict]) -> None:
        """Replace the menus and APIs attached to ``role``.

        Args:
            role: Role whose relations are rewritten.
            menu_ids: Ids of the menus to attach.
            api_infos: Dicts with ``path`` and ``method`` identifying the APIs.

        Both relations are cleared first. Ids or path/method pairs that match
        no row are skipped: ``.first()`` returns ``None`` for them, and passing
        that on to ``add()`` used to fail after the relation had already been
        cleared, leaving the role half-updated.
        """
        await role.menus.clear()
        for menu_id in menu_ids:
            menu_obj = await Menu.filter(id=menu_id).first()
            if menu_obj is None:
                continue  # stale or unknown menu id
            await role.menus.add(menu_obj)

        await role.apis.clear()
        for item in api_infos:
            api_obj = await Api.filter(path=item.get("path"), method=item.get("method")).first()
            if api_obj is None:
                continue  # no Api row for this path/method
            await role.apis.add(api_obj)


# Module-level instance created at import time.
role_controller = RoleController()

34
app/controllers/token.py Normal file
View File

@ -0,0 +1,34 @@
from app.core.crud import CRUDBase
from app.models.token import BossToken
from tortoise.expressions import Q
class BossPlatform:
    """Operations on Boss tokens, implemented on top of a ``CRUDBase`` for ``BossToken``."""

    def __init__(self):
        self.token_crud = CRUDBase(model=BossToken)

    async def get_tokens(self, page: int = 1, page_size: int = 10, search: Q = None):
        """Return one page of Boss tokens.

        Args:
            page: 1-based page number.
            page_size: Rows per page.
            search: Optional ``Q`` filter; omitted or ``None`` means "no filter".
        """
        # Forward an empty Q instead of None: an empty Q filters nothing, while
        # Tortoise's filter() only accepts Q objects as positional arguments.
        # NOTE(review): CRUDBase.list is not visible here - confirm how it uses `search`.
        if search is None:
            search = Q()
        return await self.token_crud.list(page=page, page_size=page_size, search=search)

    async def get_token(self, token_id: int):
        """Return the Boss token with the given id."""
        return await self.token_crud.get(id=token_id)

    async def create_token(self, obj_in: dict):
        """Create a Boss token from ``obj_in``."""
        return await self.token_crud.create(obj_in)

    async def update_token(self, token_id: int, obj_in: dict):
        """Update the Boss token with the given id using ``obj_in``."""
        return await self.token_crud.update(id=token_id, obj_in=obj_in)

    async def delete_token(self, token_id: int):
        """Delete the Boss token with the given id."""
        return await self.token_crud.remove(id=token_id)


# token_controller instance used by the API routes.
token_controller = BossPlatform()

63
app/controllers/user.py Normal file
View File

@ -0,0 +1,63 @@
from datetime import datetime
from typing import List, Optional
from fastapi.exceptions import HTTPException
from app.core.crud import CRUDBase
from app.models.admin import User
from app.schemas.login import CredentialsSchema
from app.schemas.users import UserCreate, UserUpdate
from app.utils.password import get_password_hash, verify_password
from .role import role_controller
class UserController(CRUDBase[User, UserCreate, UserUpdate]):
    """CRUD controller for users: lookup, authentication, roles and password reset."""

    # Password assigned by reset_password() when the caller does not pass one.
    DEFAULT_RESET_PASSWORD = "123456"

    def __init__(self):
        super().__init__(model=User)

    async def get_by_email(self, email: str) -> Optional[User]:
        """Return the first user with this e-mail address, or ``None``."""
        return await self.model.filter(email=email).first()

    async def get_by_username(self, username: str) -> Optional[User]:
        """Return the first user with this username, or ``None``."""
        return await self.model.filter(username=username).first()

    async def create_user(self, obj_in: UserCreate) -> User:
        """Hash ``obj_in.password`` and create the user.

        Note that ``obj_in`` is modified in place: its ``password`` attribute
        holds the hash afterwards.
        """
        obj_in.password = get_password_hash(password=obj_in.password)
        return await self.create(obj_in)

    async def update_last_login(self, id: int) -> None:
        """Set ``last_login`` of the user to the current local time."""
        user = await self.model.get(id=id)
        user.last_login = datetime.now()
        await user.save()

    async def authenticate(self, credentials: CredentialsSchema) -> Optional["User"]:
        """Check username and password and return the matching active user.

        Raises:
            HTTPException: 400 for an unknown username, a wrong password or a
                disabled user; 500 when the password check itself raises.
        """
        user = await self.get_by_username(credentials.username)
        if not user:
            raise HTTPException(status_code=400, detail="无效的用户名")
        try:
            verified = verify_password(credentials.password, user.password)
        except Exception as exc:
            # Chain the cause so the underlying failure stays in the traceback.
            raise HTTPException(status_code=500, detail="密码校验失败,请联系管理员安装或修复加密依赖") from exc
        if not verified:
            raise HTTPException(status_code=400, detail="密码错误!")
        if not user.is_active:
            raise HTTPException(status_code=400, detail="用户已被禁用")
        return user

    async def update_roles(self, user: User, role_ids: List[int]) -> None:
        """Replace the roles of ``user`` with the roles identified by ``role_ids``."""
        await user.roles.clear()
        for role_id in role_ids:
            role_obj = await role_controller.get(id=role_id)
            await user.roles.add(role_obj)

    async def reset_password(self, user_id: int, new_password: str = DEFAULT_RESET_PASSWORD):
        """Overwrite the password of a non-superuser account.

        Args:
            user_id: Id of the user.
            new_password: Plain-text password to set; defaults to
                ``DEFAULT_RESET_PASSWORD`` so existing callers keep their behaviour.

        Raises:
            HTTPException: 403 when the target user is a superuser.
        """
        user_obj = await self.get(id=user_id)
        if user_obj.is_superuser:
            raise HTTPException(status_code=403, detail="不允许重置超级管理员密码")
        user_obj.password = get_password_hash(password=new_password)
        await user_obj.save()


# Module-level instance created at import time.
user_controller = UserController()

View File

@ -0,0 +1,203 @@
import time
import os
from typing import Dict, Any, Optional, List, Tuple
import random
class IPStrategyConfig:
    """Thresholds that drive the IP switching strategy."""

    def __init__(self,
                 response_time_threshold_sec: int = 5,
                 proxy_failure_threshold: int = 3,
                 local_cooldown_sec: int = 1800,
                 local_failure_threshold: int = 2):
        """Store the strategy thresholds.

        Args:
            response_time_threshold_sec (int): Request duration (seconds) above
                which a response counts as slow.
            proxy_failure_threshold (int): Consecutive failures of one proxy
                that trigger a switch.
            local_cooldown_sec (int): Cool-down (seconds) between uses of the
                local IP.
            local_failure_threshold (int): Consecutive local failures after
                which the proxy pool is used again.
        """
        self.response_time_threshold_sec = response_time_threshold_sec
        self.proxy_failure_threshold = proxy_failure_threshold
        self.local_cooldown_sec = local_cooldown_sec
        self.local_failure_threshold = local_failure_threshold

    def update(self, updates: Dict[str, Any]) -> None:
        """Overwrite existing settings with the values in ``updates``.

        Keys that are not already instance attributes are ignored. Only
        instance attributes are considered: the previous ``hasattr`` test also
        accepted methods and dunder attributes, so a key such as ``"update"``
        or ``"__class__"`` could clobber the object itself.
        """
        own_attrs = vars(self)
        for key, value in updates.items():
            if key in own_attrs:
                setattr(self, key, value)
class IPAnomalyDetector:
    """Classify a response (or failure) as an IP-related anomaly."""

    def __init__(self, cfg: "IPStrategyConfig"):
        """Keep the strategy configuration used for the slow-response check.

        Args:
            cfg (IPStrategyConfig): Strategy configuration.
        """
        self.cfg = cfg

    def detect(self, status_code: Optional[int], elapsed_sec: float, resp_json: Optional[Dict], error_text: str = "") -> Optional[str]:
        """Return a reason string when the request looks IP-restricted.

        Checks run in this order and the first match wins:
        HTTP status 403/429/407, slow response, response body, error text.

        Args:
            status_code (Optional[int]): HTTP status code, or None.
            elapsed_sec (float): Request duration in seconds.
            resp_json (Optional[Dict]): Decoded JSON body. Anything that is not
                a dict (None, list, string, ...) is ignored.
            error_text (str): Text of the raised error, if any.

        Returns:
            Optional[str]: ``"http_<code>"``, ``"slow_response"``,
            ``"ip_banned"``, or None when nothing looks wrong.
        """
        if status_code in (403, 429, 407):
            return f"http_{status_code}"
        if elapsed_sec > self.cfg.response_time_threshold_sec:
            return "slow_response"
        # Only dict bodies carry code/message; a JSON array or scalar used to
        # raise AttributeError on .get().
        if resp_json and isinstance(resp_json, dict):
            msg = str(resp_json.get("message", ""))
            code = resp_json.get("code")
            # The literal "IP地址存在异常" is already covered by the two-keyword test.
            if code == 35 or ("IP" in msg and "异常" in msg):
                return "ip_banned"
        if error_text and ("IP" in error_text and "异常" in error_text):
            return "ip_banned"
        return None
class SmartIPManager:
    """Pick the route (a proxy from the pool or the local IP) for the next request."""

    def __init__(self, proxy_pool: Optional[List[Dict[str, str]]], cfg: "IPStrategyConfig"):
        """Start in proxy mode when a pool is given, otherwise in local mode.

        Args:
            proxy_pool (Optional[List[Dict[str, str]]]): Proxy dictionaries
                (requests-compatible, per the original description); None or
                empty means "no proxies".
            cfg (IPStrategyConfig): Strategy configuration.
        """
        self.cfg = cfg
        self.proxy_pool: List[Dict[str, str]] = proxy_pool or []
        # Indexes of proxies that reached the failure threshold.
        self.eliminated: set = set()
        self.current_mode: str = 'proxy' if self.proxy_pool else 'local'
        self.current_index: int = 0
        self.proxy_failures_current: int = 0
        self.local_failures: int = 0
        # time.monotonic() value of the last switch to the local IP. -inf means
        # "never used": the clock's reference point is undefined, so the old
        # initial value 0.0 made the local IP look "in cool-down" whenever the
        # clock read less than local_cooldown_sec.
        self.last_local_use_time: float = float("-inf")
        self.local_disabled_until: float = 0.0

    def current_route(self) -> Tuple[str, Optional[Dict[str, str]]]:
        """Return the current mode and, in proxy mode, the current proxy."""
        if self.current_mode == 'proxy' and self.proxy_pool:
            return 'proxy', self.proxy_pool[self.current_index]
        return 'local', None

    def mark_success(self) -> None:
        """Reset the failure counter of the current mode."""
        if self.current_mode == 'proxy':
            self.proxy_failures_current = 0
        else:
            self.local_failures = 0

    def mark_failure(self, reason: str = "") -> None:
        """Count a failure for the current mode.

        In proxy mode the current proxy is added to ``eliminated`` once the
        counter reaches ``cfg.proxy_failure_threshold``. ``reason`` is accepted
        but not used.
        """
        if self.current_mode == 'proxy':
            self.proxy_failures_current += 1
            if self.proxy_failures_current >= self.cfg.proxy_failure_threshold:
                self.eliminated.add(self.current_index)
        else:
            self.local_failures += 1

    def select_next_route(self) -> Tuple[str, Optional[Dict[str, str]]]:
        """Decide which route to use next and update the internal state.

        Proxy mode: below the failure threshold the current proxy is kept.
        At the threshold the local IP is preferred when available; otherwise
        the next non-eliminated proxy is used, and if none is left the local IP
        is used anyway.

        Local mode: the local IP is kept until ``cfg.local_failure_threshold``
        is reached and a non-eliminated proxy exists.
        """
        now = time.monotonic()
        if self.current_mode == 'proxy':
            if self.proxy_failures_current < self.cfg.proxy_failure_threshold:
                if self.proxy_pool:
                    return 'proxy', self.proxy_pool[self.current_index]
                self.current_mode = 'local'
                return 'local', None

            if not self._local_available(now):
                next_idx = self._next_proxy_index()
                if next_idx is not None:
                    self.current_index = next_idx
                    self.proxy_failures_current = 0
                    return 'proxy', self.proxy_pool[self.current_index]

            # Either the local IP is available or no usable proxy is left.
            self.current_mode = 'local'
            self.last_local_use_time = now
            self.proxy_failures_current = 0
            return 'local', None

        if self.local_failures >= self.cfg.local_failure_threshold:
            next_idx = self._next_proxy_index()
            if next_idx is not None:
                self.current_mode = 'proxy'
                self.current_index = next_idx
                self.local_failures = 0
                return 'proxy', self.proxy_pool[self.current_index]
        return 'local', None

    def _next_proxy_index(self) -> Optional[int]:
        """Return the next non-eliminated proxy index after the current one.

        The search wraps around and checks the current index last; None is
        returned when the pool is empty or every proxy is eliminated.
        """
        if not self.proxy_pool:
            return None
        n = len(self.proxy_pool)
        for step in range(1, n + 1):
            cand = (self.current_index + step) % n
            if cand not in self.eliminated:
                return cand
        return None

    def _local_available(self, now: float) -> bool:
        """Return True when the local IP is neither disabled nor cooling down.

        Args:
            now: Current ``time.monotonic()`` reading.
        """
        if now < self.local_disabled_until:
            return False
        return (now - self.last_local_use_time) >= self.cfg.local_cooldown_sec

    def disable_local_temporarily(self, seconds: int) -> None:
        """Disable the local IP for ``seconds`` from now (negative counts as 0)."""
        self.local_disabled_until = time.monotonic() + max(0, seconds)

    def manual_switch_to_proxy(self, index: int) -> None:
        """Switch to the proxy at ``index``; ignored if out of range or eliminated."""
        if 0 <= index < len(self.proxy_pool) and index not in self.eliminated:
            self.current_mode = 'proxy'
            self.current_index = index
            self.proxy_failures_current = 0

    def enable_local(self) -> None:
        """Lift a temporary disable of the local IP."""
        self.local_disabled_until = 0.0
def generate_boss_trace_id() -> str:
    """Build a trace id of the form ``F-<6 hex chars><10 random chars>``.

    The hex part is the last six hexadecimal digits of the current time in
    milliseconds; the random part is drawn from digits, lowercase and
    uppercase ASCII letters.
    NOTE(review): the original comment says this follows the platform's own
    algorithm - that cannot be confirmed from this file.
    """
    alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

    millis = int(time.time() * 1000)
    hex_digits = hex(millis)[2:]   # drop the "0x" prefix
    stamp = hex_digits[-6:]

    picked = []
    for _ in range(10):
        picked.append(random.choice(alphabet))

    return "F-" + stamp + "".join(picked)
def generate_token() -> str:
    """Return 32 random lowercase hexadecimal characters.

    The characters come from the ``random`` module, which is not a
    cryptographically secure source.
    """
    hex_chars = "0123456789abcdef"
    out = []
    for _ in range(32):
        out.append(random.choice(hex_chars))
    return "".join(out)

View File

@ -0,0 +1,32 @@
import hashlib
import hmac
import json
from typing import Dict, Any
class SignatureGenerator:
    """Produce HMAC-SHA256 request signatures with a fixed signing key."""

    def __init__(self, sign_key: str):
        self.sign_key = sign_key

    def hmac_sha256(self, message: str) -> str:
        """Return the hex HMAC-SHA256 digest of ``message`` (key and message UTF-8 encoded)."""
        digest = hmac.new(
            self.sign_key.encode('utf-8'),
            message.encode('utf-8'),
            hashlib.sha256,
        )
        return digest.hexdigest()

    def generate_signature(self, url_path: str, data: Dict[str, Any] = None) -> str:
        """Sign ``url_path`` followed by the compact JSON form of ``data``.

        Args:
            url_path: Path part of the signed message.
            data: Optional request body. When empty or None only the path is
                signed. Top-level boolean values are signed as the strings
                ``"true"`` / ``"false"``; ``data`` itself is not modified.

        Returns:
            The hex signature.
        """
        if not data:
            return self.hmac_sha256(url_path)

        # Work on a copy so the caller's dict keeps its boolean values.
        payload = data.copy()
        for key in payload:
            value = payload[key]
            if isinstance(value, bool):
                payload[key] = "true" if value else "false"

        body = json.dumps(payload, ensure_ascii=False, separators=(',', ':'))
        return self.hmac_sha256(url_path + body)

31
app/core/bgtask.py Normal file
View File

@ -0,0 +1,31 @@
from starlette.background import BackgroundTasks
from .ctx import CTX_BG_TASKS
class BgTasks:
    """Single access point for the background-task collection held in ``CTX_BG_TASKS``."""

    @classmethod
    async def init_bg_tasks_obj(cls):
        """Create a fresh ``BackgroundTasks`` object and store it in the context variable."""
        CTX_BG_TASKS.set(BackgroundTasks())

    @classmethod
    async def get_bg_tasks_obj(cls):
        """Return the ``BackgroundTasks`` object stored in the context variable."""
        return CTX_BG_TASKS.get()

    @classmethod
    async def add_task(cls, func, *args, **kwargs):
        """Queue ``func(*args, **kwargs)`` on the current background-task object."""
        holder = await cls.get_bg_tasks_obj()
        holder.add_task(func, *args, **kwargs)

    @classmethod
    async def execute_tasks(cls):
        """Run the queued tasks, if any (per the original note: usually after the response is returned)."""
        holder = await cls.get_bg_tasks_obj()
        if not holder.tasks:
            return
        await holder()

56
app/core/clickhouse.py Normal file
View File

@ -0,0 +1,56 @@
from clickhouse_connect import get_async_client
from clickhouse_connect.driver import AsyncClient as AsyncClickHouseClient
from app.settings.config import settings
import urllib3
from typing import Any, Dict
async def get_clickhouse_client() -> AsyncClickHouseClient:
    """Create a new asynchronous ClickHouse client from the application settings.

    Every call builds its own urllib3 pool manager and its own client.
    """
    # Small pool sized for multi-worker deployments (per the original note):
    #   num_pools=2 - at most two per-host connection pools are kept
    #   maxsize=5   - connections kept per pool
    #   block=True  - wait for a free connection instead of opening extra ones
    pool_mgr = urllib3.PoolManager(num_pools=2, maxsize=5, block=True)

    options = dict(
        host=settings.CLICKHOUSE_HOST,
        username=settings.CLICKHOUSE_USER,
        password=settings.CLICKHOUSE_PASS,
        database=settings.CLICKHOUSE_DB,
        port=settings.CLICKHOUSE_PORT,
        pool_mgr=pool_mgr,
        connect_timeout=30,
        send_receive_timeout=120,
    )
    client = await get_async_client(**options)
    return client
class ClickHouseManager:
    """Hold one lazily created ClickHouse client and run queries through it."""

    def __init__(self):
        # Created on first use by get_client(); None until then and after close().
        self._client: AsyncClickHouseClient = None

    async def get_client(self) -> AsyncClickHouseClient:
        """Return the cached client, creating it on first use."""
        if self._client is not None:
            return self._client
        self._client = await get_clickhouse_client()
        return self._client

    async def execute(self, query: str, parameters: Dict[str, Any] = None):
        """Run ``query`` with optional ``parameters`` and return the client's query result."""
        conn = await self.get_client()
        result = await conn.query(query, parameters=parameters)
        return result

    async def close(self):
        """Close the cached client, if any, and forget it."""
        if not self._client:
            return
        await self._client.close()
        self._client = None


# Global ClickHouse manager instance.
clickhouse_manager = ClickHouseManager()

240
app/core/clickhouse_init.py Normal file
View File

@ -0,0 +1,240 @@
from clickhouse_connect.driver import AsyncClient
from app.log import logger
class ClickHouseInitializer:
"""ClickHouse数据库初始化器"""
def __init__(self, client: AsyncClient):
self.client = client
async def create_boss_job_json_table(self):
"""创建BOSS招聘职位JSON存储表"""
create_table_sql = """
CREATE TABLE IF NOT EXISTS job_data.boss_job (
id UInt64 DEFAULT 0,
json_data String DEFAULT '', -- 原始JSON数据
job_id String DEFAULT '', -- BOSS平台去重字段jobBaseInfoVO.jobId
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
"""
try:
await self.client.command(create_table_sql)
logger.info("BOSS职位JSON数据表 boss_job 创建成功")
except Exception as e:
logger.error(f"创建BOSS职位JSON数据表失败: {e}")
raise
async def create_boss_company_json_table(self):
"""创建BOSS招聘公司JSON存储表"""
create_table_sql = """
CREATE TABLE IF NOT EXISTS job_data.boss_company (
id UInt64 DEFAULT 0,
json_data String DEFAULT '', -- 原始JSON数据
company_name String DEFAULT '', -- 公司名称去重字段
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
"""
try:
await self.client.command(create_table_sql)
logger.info("BOSS公司JSON数据表 boss_company 创建成功")
except Exception as e:
logger.error(f"创建BOSS公司JSON数据表失败: {e}")
raise
async def create_qcwy_job_json_table(self):
"""创建前程无忧职位JSON存储表"""
create_table_sql = """
CREATE TABLE IF NOT EXISTS job_data.qcwy_job (
id UInt64 DEFAULT 0,
json_data String DEFAULT '', -- 原始JSON数据
job_id String DEFAULT '', -- QCWY平台去重字段jobId
update_date_time String DEFAULT '', -- QCWY平台去重字段updateDateTime
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
"""
try:
await self.client.command(create_table_sql)
logger.info("前程无忧职位JSON数据表 qcwy_job 创建成功")
except Exception as e:
logger.error(f"创建前程无忧职位JSON数据表失败: {e}")
raise
async def create_qcwy_company_json_table(self):
"""创建前程无忧公司JSON存储表"""
create_table_sql = """
CREATE TABLE IF NOT EXISTS job_data.qcwy_company (
id UInt64 DEFAULT 0,
json_data String DEFAULT '', -- 原始JSON数据
company_name String DEFAULT '', -- 公司名称去重字段
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
"""
try:
await self.client.command(create_table_sql)
logger.info("前程无忧公司JSON数据表 qcwy_company 创建成功")
except Exception as e:
logger.error(f"创建前程无忧公司JSON数据表失败: {e}")
raise
async def create_zhilian_job_json_table(self):
    """Create the ``job_data.zhilian_job`` table for raw Zhilian job JSON.

    Sends a ``CREATE TABLE IF NOT EXISTS`` statement through ``self.client``.
    Any exception raised by the client is logged and then re-raised.
    """
    ddl = """
CREATE TABLE IF NOT EXISTS job_data.zhilian_job (
id UInt64 DEFAULT 0,
json_data String DEFAULT '', -- 原始JSON数据
number String DEFAULT '', -- 智联平台去重字段number
first_publish_time String DEFAULT '', -- 智联平台去重字段firstPublishTime
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
"""
    try:
        await self.client.command(ddl)
        logger.info("智联招聘职位JSON数据表 zhilian_job 创建成功")
    except Exception as exc:
        # Log for operators, then let the caller decide how to react.
        logger.error(f"创建智联招聘职位JSON数据表失败: {exc}")
        raise
async def create_zhilian_company_json_table(self):
    """Create the ``job_data.zhilian_company`` table for raw Zhilian company JSON.

    Sends a ``CREATE TABLE IF NOT EXISTS`` statement through ``self.client``.
    Any exception raised by the client is logged and then re-raised.
    """
    ddl = """
CREATE TABLE IF NOT EXISTS job_data.zhilian_company (
id UInt64 DEFAULT 0,
json_data String DEFAULT '', -- 原始JSON数据
company_name String DEFAULT '', -- 公司名称去重字段
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now()
) ENGINE = MergeTree()
ORDER BY created_at
SETTINGS index_granularity = 8192;
"""
    try:
        await self.client.command(ddl)
        logger.info("智联招聘公司JSON数据表 zhilian_company 创建成功")
    except Exception as exc:
        # Log for operators, then let the caller decide how to react.
        logger.error(f"创建智联招聘公司JSON数据表失败: {exc}")
        raise
async def create_pending_company_table(self):
    """Create the ``job_data.pending_company`` table.

    The table uses ``ReplacingMergeTree(version)`` ordered by
    ``(source, company_id)``. The statement is sent through ``self.client``;
    any exception raised by the client is logged and then re-raised.
    """
    ddl = """
CREATE TABLE IF NOT EXISTS job_data.pending_company (
source String,
company_id String,
company_name String DEFAULT '',
status String DEFAULT 'pending',
error_msg String DEFAULT '',
created_at DateTime DEFAULT now(),
updated_at DateTime DEFAULT now(),
version UInt64 DEFAULT 1
) ENGINE = ReplacingMergeTree(version)
ORDER BY (source, company_id)
SETTINGS index_granularity = 8192;
"""
    try:
        await self.client.command(ddl)
        logger.info("待处理公司表 pending_company 创建成功")
    except Exception as exc:
        # Log for operators, then let the caller decide how to react.
        logger.error(f"创建待处理公司表失败: {exc}")
        raise
async def create_job_analytics_view(self):
    """Create the unified ``job_data.job_analytics`` view.

    The view is a ``UNION ALL`` over ``boss_job``, ``qcwy_job`` and
    ``zhilian_job``. Each branch extracts the same set of columns from the
    stored JSON, and the salary bounds are constant ``0.0`` in every branch.
    Any exception raised by the client is logged and then re-raised.
    """
    view_ddl = """
CREATE VIEW IF NOT EXISTS job_data.job_analytics AS
SELECT
'boss' as source,
job_id,
JSONExtractString(json_data, 'jobName') as position_name,
JSONExtractString(json_data, 'brandName') as company_name,
JSONExtractString(json_data, 'salaryDesc') as salary_text,
0.0 as salary_min,
0.0 as salary_max,
JSONExtractString(json_data, 'cityName') as city,
JSONExtractString(json_data, 'experienceName') as experience_required,
JSONExtractString(json_data, 'degreeName') as education,
created_at
FROM job_data.boss_job
UNION ALL
SELECT
'qcwy' as source,
job_id,
JSONExtractString(json_data, 'jobName') as position_name,
JSONExtractString(json_data, 'companyName') as company_name,
JSONExtractString(json_data, 'provideSalaryString') as salary_text,
0.0, 0.0,
JSONExtractString(json_data, 'workCity') as city,
JSONExtractString(json_data, 'workYear') as experience_required,
JSONExtractString(json_data, 'degree') as education,
created_at
FROM job_data.qcwy_job
UNION ALL
SELECT
'zhilian' as source,
number as job_id,
JSONExtractString(json_data, 'jobName') as position_name,
JSONExtractString(json_data, 'companyName') as company_name,
JSONExtractString(json_data, 'salary60') as salary_text,
0.0, 0.0,
JSONExtractString(json_data, 'workCity') as city,
JSONExtractString(json_data, 'workingExp') as experience_required,
JSONExtractString(json_data, 'education') as education,
created_at
FROM job_data.zhilian_job
"""
    try:
        await self.client.command(view_ddl)
        logger.info("招聘数据分析视图 job_analytics 创建成功")
    except Exception as exc:
        # Log for operators, then let the caller decide how to react.
        logger.error(f"创建招聘数据分析视图失败: {exc}")
        raise
async def initialize_all_tables(self):
    """Create every ClickHouse table and view, one after another.

    Order: BOSS job/company tables, QCWY job/company tables, Zhilian
    job/company tables, the pending-company table, and finally the unified
    analytics view. The first failure is logged and re-raised, so later steps
    are not attempted.
    """
    logger.info("开始初始化 ClickHouse 数据库表...")
    try:
        steps = (
            self.create_boss_job_json_table,
            self.create_boss_company_json_table,
            self.create_qcwy_job_json_table,
            self.create_qcwy_company_json_table,
            self.create_zhilian_job_json_table,
            self.create_zhilian_company_json_table,
            self.create_pending_company_table,
            # The view selects from the job tables, so it is created last.
            self.create_job_analytics_view,
        )
        for step in steps:
            await step()
        logger.info("ClickHouse 数据库表初始化完成")
    except Exception as exc:
        logger.error(f"ClickHouse 数据库初始化失败: {exc}")
        raise

49
app/core/crud.py Normal file
View File

@ -0,0 +1,49 @@
from typing import Any, Dict, Generic, List, NewType, Tuple, Type, TypeVar, Union
from pydantic import BaseModel
from tortoise.expressions import Q
from tortoise.models import Model
# Distinct int type used for the total row count returned next to a page of results.
Total = NewType("Total", int)
# Type parameters for CRUDBase: the Tortoise model and the pydantic schemas
# accepted by create/update.
ModelType = TypeVar("ModelType", bound=Model)
CreateSchemaType = TypeVar("CreateSchemaType", bound=BaseModel)
UpdateSchemaType = TypeVar("UpdateSchemaType", bound=BaseModel)
class CRUDBase(Generic[ModelType, CreateSchemaType, UpdateSchemaType]):
    """Generic async CRUD helper around a single Tortoise ORM model."""

    def __init__(self, model: Type[ModelType]):
        """Remember the model class that all operations act on."""
        self.model = model

    async def get(self, id: int) -> ModelType:
        """Return the row with primary key ``id`` via ``model.get(id=id)``."""
        return await self.model.get(id=id)

    async def list(
        self,
        page: int,
        page_size: int,
        search: Union[Q, None] = None,
        order: Union[list, None] = None,
    ) -> Tuple[Total, List[ModelType]]:
        """Return ``(total_count, rows)`` for one page of results.

        Args:
            page: 1-based page number.
            page_size: Maximum number of rows in the page.
            search: Filter expression; ``None`` means an empty ``Q()`` (no filter).
            order: Field names passed to ``order_by``; ``None`` means no ordering.
        """
        # Build the defaults per call rather than sharing one Q()/list object
        # across every invocation through the function signature.
        if search is None:
            search = Q()
        if order is None:
            order = []
        query = self.model.filter(search)
        return await query.count(), await query.offset((page - 1) * page_size).limit(page_size).order_by(*order)

    async def create(self, obj_in: Union[CreateSchemaType, Dict[str, Any]]) -> ModelType:
        """Create and save a row from a schema object or a plain dict."""
        obj_dict = obj_in if isinstance(obj_in, dict) else obj_in.model_dump()
        obj = self.model(**obj_dict)
        await obj.save()
        return obj

    async def update(self, id: int, obj_in: Union[UpdateSchemaType, Dict[str, Any]]) -> ModelType:
        """Apply the given fields to the row with primary key ``id`` and save it.

        For schema input only explicitly-set fields are applied, and ``id`` is
        never overwritten.
        """
        if isinstance(obj_in, dict):
            obj_dict = obj_in
        else:
            obj_dict = obj_in.model_dump(exclude_unset=True, exclude={"id"})
        obj = await self.get(id=id)
        obj = obj.update_from_dict(obj_dict)
        await obj.save()
        return obj

    async def remove(self, id: int) -> None:
        """Delete the row with primary key ``id``."""
        obj = await self.get(id=id)
        await obj.delete()

    async def filter_one(self, **kwargs) -> Union[ModelType, None]:
        """Return the first row matching ``kwargs`` (the result of ``.first()``)."""
        return await self.model.filter(**kwargs).first()

6
app/core/ctx.py Normal file
View File

@ -0,0 +1,6 @@
import contextvars
from starlette.background import BackgroundTasks
# Context variable "user_id"; reads as 0 when nothing has been set.
CTX_USER_ID: contextvars.ContextVar[int] = contextvars.ContextVar("user_id", default=0)
# Context variable "bg_task" for a BackgroundTasks object; reads as None when
# nothing has been set (despite the non-Optional annotation).
CTX_BG_TASKS: contextvars.ContextVar[BackgroundTasks] = contextvars.ContextVar("bg_task", default=None)

64
app/core/dependency.py Normal file
View File

@ -0,0 +1,64 @@
from typing import Optional, Dict, Any
import jwt
from fastapi import Depends, Header, HTTPException, Request
from app.core.ctx import CTX_USER_ID
from app.models import Role, User
from app.settings import settings
def get_list_params(skip: int = 0, limit: int = 10, filters: Dict[str, Any] = None, sort_by: str = None, sort_order: str = "desc"):
"""Bundle list-query arguments into a ``ListParams`` object.

``filters`` falls back to an empty dict when it is None or empty.
"""
# Imported at call time rather than at module import.
# NOTE(review): the app/core/crud.py added in this same change does not appear
# to define ``ListParams``; if so this import raises ImportError when the
# function is called. Confirm where ListParams is meant to live.
from app.core.crud import ListParams
return ListParams(
skip=skip,
limit=limit,
filters=filters or {},
sort_by=sort_by,
sort_order=sort_order
)
class AuthControl:
    """FastAPI dependency that authenticates a request from its ``token`` header."""

    @classmethod
    async def is_authed(cls, token: str = Header(..., description="token验证")) -> Optional["User"]:
        """Resolve the ``token`` header to a ``User`` and store its id in ``CTX_USER_ID``.

        Raises:
            HTTPException(401): the token is malformed or expired, or no
                matching user exists.
            HTTPException(500): any other unexpected error.
        """
        try:
            if token == "dev":
                # NOTE(security): any client that sends the literal token "dev"
                # is authenticated as the first user in the table. Make sure
                # this shortcut is unreachable in production deployments.
                user = await User.filter().first()
            else:
                decode_data = jwt.decode(token, settings.SECRET_KEY, algorithms=settings.JWT_ALGORITHM)
                user = await User.filter(id=decode_data.get("user_id")).first()
            if not user:
                # Covers an empty user table in the "dev" branch as well, which
                # previously failed on ``user.id`` instead of returning a 401.
                raise HTTPException(status_code=401, detail="Authentication failed")
            CTX_USER_ID.set(int(user.id))
            return user
        except HTTPException:
            # Keep the intended status code; without this clause the generic
            # handler below turned the 401 above into a 500.
            raise
        except jwt.DecodeError:
            raise HTTPException(status_code=401, detail="无效的Token")
        except jwt.ExpiredSignatureError:
            raise HTTPException(status_code=401, detail="登录已过期")
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"{repr(e)}")
class PermissionControl:
    """FastAPI dependency that checks the current user may call the requested API."""

    @classmethod
    async def has_permission(cls, request: Request, current_user: User = Depends(AuthControl.is_authed)) -> None:
        """Raise 403 unless the user is a superuser or a role grants ``(method, path)``."""
        if current_user.is_superuser:
            return

        wanted = (request.method, request.url.path)
        roles: list[Role] = await current_user.roles
        if not roles:
            raise HTTPException(status_code=403, detail="The user is not bound to a role")

        # Collect every (method, path) pair granted by any of the user's roles.
        granted = set()
        for role in roles:
            for api in await role.apis:
                granted.add((api.method, api.path))

        if wanted not in granted:
            method, path = wanted
            raise HTTPException(status_code=403, detail=f"Permission denied method:{method} path:{path}")
# Ready-made dependency objects wrapping the two checks above.
DependAuth = Depends(AuthControl.is_authed)
DependPermission = Depends(PermissionControl.has_permission)

55
app/core/exceptions.py Normal file
View File

@ -0,0 +1,55 @@
from fastapi.exceptions import (
HTTPException,
RequestValidationError,
ResponseValidationError,
)
from fastapi.requests import Request
from fastapi.responses import JSONResponse
from tortoise.exceptions import DoesNotExist, IntegrityError
from app.log import logger
class SettingNotFound(Exception):
    """Exception type for a missing setting; adds nothing to ``Exception``."""
async def DoesNotExistHandle(req: Request, exc: DoesNotExist) -> JSONResponse:
    """Turn a Tortoise ``DoesNotExist`` into a 404 JSON response."""
    message = f"Object has not found, exc: {exc}, query_params: {req.query_params}"
    return JSONResponse(content={"code": 404, "msg": message}, status_code=404)
async def IntegrityHandle(_: Request, exc: IntegrityError) -> JSONResponse:
    """Turn a Tortoise ``IntegrityError`` into a 500 JSON response."""
    payload = {"code": 500, "msg": f"IntegrityError{exc}"}
    return JSONResponse(content=payload, status_code=500)
async def HttpExcHandle(_: Request, exc: HTTPException) -> JSONResponse:
    """Render an ``HTTPException`` as a ``{code, msg, data}`` body with the same status code."""
    status = exc.status_code
    payload = {"code": status, "msg": exc.detail, "data": None}
    return JSONResponse(content=payload, status_code=status)
async def RequestValidationHandle(req: Request, exc: RequestValidationError) -> JSONResponse:
    """Log the offending request, then answer with a 422 JSON response.

    The request body is logged with at most 10000 characters. A failure while
    logging is itself logged and never changes the response.
    """
    try:
        raw = (await req.body()).decode("utf-8", errors="replace")
        # Cap the logged body so one oversized payload cannot flood the log.
        shown = raw if len(raw) <= 10000 else raw[:10000] + "..."
        logger.error(
            f"422 RequestValidationError path={req.url.path} errors={exc.errors()} body={shown}"
        )
    except Exception as log_err:
        logger.error(f"422 RequestValidationError logging failed: {log_err}")
    payload = {"code": 422, "msg": f"RequestValidationError, {exc}"}
    return JSONResponse(content=payload, status_code=422)
async def ResponseValidationHandle(_: Request, exc: ResponseValidationError) -> JSONResponse:
    """Turn a ``ResponseValidationError`` into a 500 JSON response."""
    payload = {"code": 500, "msg": f"ResponseValidationError, {exc}"}
    return JSONResponse(content=payload, status_code=500)

351
app/core/init_app.py Normal file
View File

@ -0,0 +1,351 @@
import shutil
from aerich import Command
from fastapi import FastAPI
from fastapi.middleware import Middleware
from fastapi.middleware.cors import CORSMiddleware
from tortoise.expressions import Q
from app.api import api_router
from app.controllers.api import api_controller
from app.controllers.user import UserCreate, user_controller
from app.core.exceptions import (
DoesNotExist,
DoesNotExistHandle,
HTTPException,
HttpExcHandle,
IntegrityError,
IntegrityHandle,
RequestValidationError,
RequestValidationHandle,
ResponseValidationError,
ResponseValidationHandle,
)
from app.log import logger
from app.models.admin import Api, Menu, Role
from app.schemas.menus import MenuType
from app.settings.config import settings
from app.core.clickhouse import clickhouse_manager
from app.core.clickhouse_init import ClickHouseInitializer
from .middlewares import BackGroundTaskMiddleware, HttpAuditLogMiddleware
from .ip_tracking import IpTrackingMiddleware
def make_middlewares():
    """Build the application's middleware list.

    Order: CORS, background tasks, HTTP audit log, IP tracking.
    """
    cors = Middleware(
        CORSMiddleware,
        allow_origins=settings.CORS_ORIGINS,
        allow_credentials=settings.CORS_ALLOW_CREDENTIALS,
        allow_methods=settings.CORS_ALLOW_METHODS,
        allow_headers=settings.CORS_ALLOW_HEADERS,
    )
    background_tasks = Middleware(BackGroundTaskMiddleware)
    audit_log = Middleware(
        HttpAuditLogMiddleware,
        methods=["GET", "POST", "PUT", "DELETE"],
        exclude_paths=[
            "/api/v1/base/access_token",
            "/docs",
            "/openapi.json",
        ],
    )
    ip_tracking = Middleware(IpTrackingMiddleware)
    return [cors, background_tasks, audit_log, ip_tracking]
def register_exceptions(app: FastAPI):
    """Attach the project's exception handlers to ``app``."""
    handlers = (
        (DoesNotExist, DoesNotExistHandle),
        (HTTPException, HttpExcHandle),
        (IntegrityError, IntegrityHandle),
        (RequestValidationError, RequestValidationHandle),
        (ResponseValidationError, ResponseValidationHandle),
    )
    for exc_type, handler in handlers:
        app.add_exception_handler(exc_type, handler)
def register_routers(app: FastAPI, prefix: str = "/api"):
    """Mount the top-level ``api_router`` on ``app`` under ``prefix``."""
    app.include_router(router=api_router, prefix=prefix)
async def init_superuser():
    """Create the default ``admin`` superuser when the user table is empty."""
    if await user_controller.model.exists():
        return
    default_admin = UserCreate(
        username="admin",
        email="admin@admin.com",
        password="123456",
        is_active=True,
        is_superuser=True,
    )
    await user_controller.create_user(default_admin)
async def init_menus():
"""Seed the default menu tree.

Creates three top-level catalogs, each followed by a bulk insert of its
child menus: system management, recruitment data management and data
cleaning.

NOTE(review): presumably all three groups are created only when the Menu
table is empty (inside ``if not menus``) - confirm against the properly
indented file.
"""
menus = await Menu.exists()
if not menus:
# Top-level "system management" catalog; its id is the parent of the children below.
parent_menu = await Menu.create(
menu_type=MenuType.CATALOG,
name="系统管理",
path="/system",
order=1,
parent_id=0,
icon="carbon:gui-management",
is_hidden=False,
component="Layout",
keepalive=False,
redirect="/system/user",
)
# Children of the system-management catalog: user, role, menu, API,
# department and audit-log pages.
children_menu = [
Menu(
menu_type=MenuType.MENU,
name="用户管理",
path="user",
order=1,
parent_id=parent_menu.id,
icon="material-symbols:person-outline-rounded",
is_hidden=False,
component="/system/user",
keepalive=False,
),
Menu(
menu_type=MenuType.MENU,
name="角色管理",
path="role",
order=2,
parent_id=parent_menu.id,
icon="carbon:user-role",
is_hidden=False,
component="/system/role",
keepalive=False,
),
Menu(
menu_type=MenuType.MENU,
name="菜单管理",
path="menu",
order=3,
parent_id=parent_menu.id,
icon="material-symbols:list-alt-outline",
is_hidden=False,
component="/system/menu",
keepalive=False,
),
Menu(
menu_type=MenuType.MENU,
name="API管理",
path="api",
order=4,
parent_id=parent_menu.id,
icon="ant-design:api-outlined",
is_hidden=False,
component="/system/api",
keepalive=False,
),
Menu(
menu_type=MenuType.MENU,
name="部门管理",
path="dept",
order=5,
parent_id=parent_menu.id,
icon="mingcute:department-line",
is_hidden=False,
component="/system/dept",
keepalive=False,
),
Menu(
menu_type=MenuType.MENU,
name="审计日志",
path="auditlog",
order=6,
parent_id=parent_menu.id,
icon="ph:clipboard-text-bold",
is_hidden=False,
component="/system/auditlog",
keepalive=False,
),
]
await Menu.bulk_create(children_menu)
# Create the recruitment data management menu
recruitment_menu = await Menu.create(
menu_type=MenuType.CATALOG,
name="招聘数据管理",
path="/recruitment",
order=2,
parent_id=0,
icon="mdi:briefcase-search",
is_hidden=False,
component="Layout",
keepalive=False,
redirect="/recruitment/qcwy",
)
# One child page per recruitment platform (qcwy, zhilian, boss).
recruitment_children = [
Menu(
menu_type=MenuType.MENU,
name="前程无忧",
path="qcwy",
order=1,
parent_id=recruitment_menu.id,
icon="mdi:alpha-q-box",
is_hidden=False,
component="/recruitment/qcwy",
keepalive=True,
),
Menu(
menu_type=MenuType.MENU,
name="智联招聘",
path="zhilian",
order=2,
parent_id=recruitment_menu.id,
icon="mdi:alpha-z-box",
is_hidden=False,
component="/recruitment/zhilian",
keepalive=True,
),
Menu(
menu_type=MenuType.MENU,
name="Boss直聘",
path="boss",
order=3,
parent_id=recruitment_menu.id,
icon="mdi:alpha-b-box",
is_hidden=False,
component="/recruitment/boss",
keepalive=True,
),
]
await Menu.bulk_create(recruitment_children)
# Create the data cleaning menu
cleaning_menu = await Menu.create(
menu_type=MenuType.CATALOG,
name="数据清理",
path="/cleaning",
order=3,
parent_id=0,
icon="mdi:database-refresh",
is_hidden=False,
component="Layout",
keepalive=False,
redirect="/cleaning/targeted",
)
# Children of the data-cleaning catalog: targeted data and cleaning monitor.
cleaning_children = [
Menu(
menu_type=MenuType.MENU,
name="定向数据",
path="targeted",
order=1,
parent_id=cleaning_menu.id,
icon="mdi:filter-target",
is_hidden=False,
component="/cleaning/index",
keepalive=True,
),
Menu(
menu_type=MenuType.MENU,
name="清洗监控",
path="monitor",
order=2,
parent_id=cleaning_menu.id,
icon="mdi:monitor-dashboard",
is_hidden=False,
component="/cleaning/monitor",
keepalive=True,
),
]
await Menu.bulk_create(cleaning_children)
async def init_apis():
    """Populate the API table through ``api_controller.refresh_api()`` when it is empty."""
    already_seeded = await api_controller.model.exists()
    if already_seeded:
        return
    await api_controller.refresh_api()
async def init_db():
    """Run the aerich database migrations.

    Steps: initialise the migration directory, generate a migration, then
    upgrade inside a transaction. Whether this runs at all is decided by the
    caller.
    """
    command = Command(tortoise_config=settings.TORTOISE_ORM)
    try:
        await command.init_db(safe=True)
    except FileExistsError:
        # The migrations directory already exists (every start after the
        # first); that is the normal case, not an error.
        pass
    await command.init()
    try:
        await command.migrate()
    except AttributeError:
        # Model history is unreadable: rebuild the migrations from scratch.
        logger.warning("unable to retrieve model history from database, model history will be created from scratch")
        shutil.rmtree("migrations")
        await command.init_db(safe=True)
    await command.upgrade(run_in_transaction=True)
async def init_roles():
    """Seed the two default roles and their grants when no role exists.

    The admin role receives every API and every menu. The regular-user role
    receives every menu plus the APIs that are GET or tagged "基础模块".
    """
    if await Role.exists():
        return

    admin_role = await Role.create(
        name="管理员",
        desc="管理员角色",
    )
    user_role = await Role.create(
        name="普通用户",
        desc="普通用户角色",
    )

    # Admin: every API.
    await admin_role.apis.add(*await Api.all())

    # Both roles: every menu.
    every_menu = await Menu.all()
    await admin_role.menus.add(*every_menu)
    await user_role.menus.add(*every_menu)

    # Regular user: the basic API subset only.
    basic_apis = await Api.filter(Q(method__in=["GET"]) | Q(tags="基础模块"))
    await user_role.apis.add(*basic_apis)
async def init_clickhouse():
    """Initialise the ClickHouse tables; a no-op when no host is configured.

    Failures are logged and swallowed, so they never propagate to the caller.
    """
    if not settings.CLICKHOUSE_HOST:
        return
    try:
        ch_client = await clickhouse_manager.get_client()
        await ClickHouseInitializer(ch_client).initialize_all_tables()
        logger.info("ClickHouse初始化完成")
    except Exception as exc:
        logger.error(f"ClickHouse初始化失败: {exc}")
async def init_data():
    """Run startup initialisation, guarded by a directory lock.

    Migrations run when ``settings.RUN_MIGRATIONS_ON_STARTUP`` is set and seed
    data is created when ``settings.INITIALIZE_SEED_DATA_ON_STARTUP`` is set,
    both only in the process that created the ``.startup_lock`` directory.
    The lock is removed even when an initialisation step raises, so a failed
    start cannot leave a stale lock that makes later starts skip these steps.
    """
    import os

    should_migrate = settings.RUN_MIGRATIONS_ON_STARTUP
    should_seed = settings.INITIALIZE_SEED_DATA_ON_STARTUP
    lock_dir = ".startup_lock"

    # Simple directory lock: mkdir fails when another worker already holds it.
    try:
        os.mkdir(lock_dir)
        acquired = True
    except Exception:
        acquired = False

    try:
        if should_migrate and acquired:
            await init_db()
        if should_seed and acquired:
            await init_superuser()
            await init_menus()
            await init_apis()
            await init_roles()
        # ClickHouse initialisation is optional and must not affect the main app.
        # NOTE(review): assumed to run in every worker, not gated by the lock - confirm.
        await init_clickhouse()
    finally:
        if acquired:
            try:
                os.rmdir(lock_dir)
            except Exception:
                # Best effort: the directory may already have been removed.
                pass

82
app/core/ip_tracking.py Normal file
View File

@ -0,0 +1,82 @@
from datetime import datetime
from typing import Any
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
from starlette.requests import Request
from starlette.responses import Response
from app.models.metrics import IpUploadStats
class IpTrackingMiddleware(BaseHTTPMiddleware):
    """Counts uploads per (source, ip, date) for the data-upload endpoints.

    Bookkeeping happens after the downstream response is produced and is
    strictly best effort: any error in it is swallowed and the response is
    returned unchanged.
    """

    async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
        """Run the request, then record upload statistics for tracked paths."""
        # Errors from downstream propagate untouched to outer handlers.
        response = await call_next(request)

        try:
            path = request.url.path
            tracked_prefixes = (
                "/api/v1/universal/data",
                "/api/v1/boss/",
                "/api/v1/qcwy/",
                "/api/v1/zhilian/",
            )
            if path.startswith(tracked_prefixes):
                args = getattr(request.state, "request_args", {})
                # An explicit "platform" argument wins; otherwise infer from the URL.
                source = args.get("platform")
                if not source:
                    source = ""
                    for prefix, name in (
                        ("/api/v1/boss/", "boss"),
                        ("/api/v1/qcwy/", "qcwy"),
                        ("/api/v1/zhilian/", "zhilian"),
                    ):
                        if path.startswith(prefix):
                            source = name
                            break
                ip = self._extract_ip(request)
                count = self._estimate_count(args, response)
                if source and ip and count:
                    await self._update_stats(source, ip, count)
        except Exception:
            pass
        return response

    def _extract_ip(self, request: Request) -> str:
        """Return the client IP: first X-Forwarded-For entry, then X-Real-IP, then the socket peer."""
        headers = request.headers
        forwarded = headers.get("x-forwarded-for") or headers.get("X-Forwarded-For")
        if forwarded:
            return forwarded.split(",")[0].strip()
        real_ip = headers.get("x-real-ip") or headers.get("X-Real-IP")
        if real_ip:
            return real_ip.strip()
        if request.client:
            return request.client.host
        return ""

    def _estimate_count(self, args: dict, response: Response) -> int:
        """Estimate how many records this request uploaded; 0 when unknown or on error."""
        try:
            # Synchronous endpoints: read data.success from the response body.
            if hasattr(response, "body") and response.body:
                import json

                parsed = json.loads(response.body)
                if isinstance(parsed, dict) and isinstance(parsed.get("data"), dict):
                    inner = parsed["data"]
                    if "success" in inner:
                        return int(inner.get("success", 0))
            # Asynchronous endpoints or no detailed response: estimate from the request.
            batch = args.get("data_list")
            if "data_list" in args and isinstance(batch, list):
                return len(batch)
            if "data" in args:
                return 1
        except Exception:
            pass
        return 0

    async def _update_stats(self, source: str, ip: str, inc: int) -> None:
        """Add ``inc`` to today's counter for ``(source, ip)``, creating the row if needed."""
        from datetime import timezone

        # Timezone-aware "now" in UTC; the row's date is derived from it.
        now = datetime.now(timezone.utc)
        today = now.date()
        row = await IpUploadStats.get_or_none(source=source, ip=ip, date=today)
        if not row:
            await IpUploadStats.create(
                source=source,
                ip=ip,
                date=today,
                upload_count=inc,
                last_report_at=now,
                status="normal",
            )
            return
        row.upload_count = row.upload_count + inc
        row.last_report_at = now
        # A fresh report sets the row's status back to "normal".
        if getattr(row, "status", "normal") != "normal":
            row.status = "normal"
        await row.save()

75
app/core/locks.py Normal file
View File

@ -0,0 +1,75 @@
import os
import time
import uuid
from contextlib import asynccontextmanager
class DistributedLock:
    """Named lock that prefers Redis and falls back to a lock directory.

    The constructor tries to reach Redis; if the import, the configuration
    lookup or the ping fails, the lock uses a ``.lock_<name>`` directory in
    the current working directory instead (safe on a single machine only).
    """

    def __init__(self, name: str, ttl_seconds: int = 600):
        """
        Args:
            name: Lock name; used in the Redis key and the lock directory name.
            ttl_seconds: Expiry of the Redis key. The directory fallback has no expiry.
        """
        self.name = name
        self.ttl = ttl_seconds
        self.token = str(uuid.uuid4())
        self._use_redis = False
        self._redis = None
        self._file_path = f".lock_{self.name}"
        # Backend through which THIS instance currently holds the lock:
        # "redis", "file", or None when it does not hold it.
        self._held_via = None
        try:
            import redis  # type: ignore
            from app.settings.config import settings

            self._redis = redis.Redis(
                host=getattr(settings, "REDIS_HOST", None) or "",
                port=getattr(settings, "REDIS_PORT", 6379),
                db=getattr(settings, "REDIS_DB", 0),
                password=getattr(settings, "REDIS_PASS", None) or None,
                socket_timeout=3,
            )
            # Only use Redis when it actually answers.
            if self._redis.ping():
                self._use_redis = True
        except Exception:
            self._use_redis = False

    async def acquire(self) -> bool:
        """Try to take the lock once; return whether it was obtained."""
        if self._use_redis and self._redis is not None:
            try:
                # SET NX EX creates the key atomically with its expiry.
                got_it = bool(self._redis.set(f"lock:{self.name}", self.token, nx=True, ex=self.ttl))
                if got_it:
                    self._held_via = "redis"
                return got_it
            except Exception:
                # Redis failed mid-call: fall through to the directory lock.
                pass
        # Directory fallback: mkdir fails when the directory already exists.
        try:
            os.mkdir(self._file_path)
        except Exception:
            return False
        self._held_via = "file"
        return True

    async def release(self) -> None:
        """Release the lock if, and only if, this instance holds it.

        Only the backend that granted the lock is touched, so a release can
        never remove a lock directory or Redis key owned by another holder.
        """
        held_via, self._held_via = self._held_via, None
        if held_via == "redis" and self._redis is not None:
            try:
                # Simple check-then-delete; production use should do this
                # atomically with a Lua script.
                key = f"lock:{self.name}"
                val = self._redis.get(key)
                if val and val.decode() == self.token:
                    self._redis.delete(key)
            except Exception:
                pass
        elif held_via == "file":
            try:
                os.rmdir(self._file_path)
            except Exception:
                pass

    @asynccontextmanager
    async def context(self):
        """Async context manager yielding True when the lock was obtained, else False.

        The body always runs; callers must check the yielded flag. The lock is
        released on exit only when it was obtained.
        """
        acquired = await self.acquire()
        try:
            yield acquired
        finally:
            if acquired:
                await self.release()

182
app/core/middlewares.py Normal file
View File

@ -0,0 +1,182 @@
import json
import re
from datetime import datetime
from typing import Any, AsyncGenerator
from fastapi import FastAPI
from fastapi.responses import Response
from fastapi.routing import APIRoute
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
from starlette.requests import Request
from starlette.types import ASGIApp, Receive, Scope, Send
from app.core.dependency import AuthControl
from app.models.admin import AuditLog, User
from .bgtask import BgTasks
class SimpleBaseMiddleware:
    """Minimal ASGI middleware with ``before_request`` / ``after_request`` hooks.

    Non-HTTP scopes are passed straight through to the wrapped app.
    """

    def __init__(self, app: ASGIApp) -> None:
        self.app = app

    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
        if scope["type"] == "http":
            request = Request(scope, receive=receive)
            # A falsy hook result means "continue with the wrapped app".
            handler = await self.before_request(request)
            if not handler:
                handler = self.app
            await handler(request.scope, request.receive, send)
            await self.after_request(request)
            return
        await self.app(scope, receive, send)

    async def before_request(self, request: Request):
        """Hook run before the app; returns the ASGI callable that handles the request."""
        return self.app

    async def after_request(self, request: Request):
        """Hook run after the response has been sent; does nothing by default."""
        return None
class BackGroundTaskMiddleware(SimpleBaseMiddleware):
    """Initialises ``BgTasks`` before each request and runs the queued tasks afterwards."""

    async def before_request(self, request):
        # Returns None implicitly, so the base class proceeds with the wrapped app.
        await BgTasks.init_bg_tasks_obj()

    async def after_request(self, request):
        await BgTasks.execute_tasks()
class HttpAuditLogMiddleware(BaseHTTPMiddleware):
    """Writes an ``AuditLog`` row for every request whose method is audited.

    The request arguments are collected before the request is handled and
    stored on ``request.state.request_args``. The response body and timing
    are added afterwards.
    """

    def __init__(self, app, methods: list[str], exclude_paths: list[str]):
        """
        Args:
            app: The wrapped ASGI application.
            methods: HTTP methods that are audited.
            exclude_paths: Regex patterns; a request whose path matches any of
                them (case-insensitive search) is not audited.
        """
        super().__init__(app)
        self.methods = methods
        self.exclude_paths = exclude_paths
        self.audit_log_paths = ["/api/v1/auditlog/list"]
        self.max_body_size = 1024 * 1024  # 1 MB cap on response bodies that get logged

    async def get_request_args(self, request: Request) -> dict:
        """Collect query parameters plus the JSON or form body into one dict.

        A JSON object body is merged key by key. Any other non-null JSON value
        (array, string, number) is stored under the ``"body"`` key. When the
        body is not valid JSON it is parsed as form data, with uploaded files
        represented by their file names.
        """
        args = {}
        # Query parameters
        for key, value in request.query_params.items():
            args[key] = value
        # Request body
        if request.method in ["POST", "PUT", "PATCH"]:
            try:
                body = await request.json()
            except ValueError:
                # ValueError covers JSONDecodeError and also UnicodeDecodeError,
                # which binary/multipart bodies raise while being decoded.
                try:
                    form = await request.form()
                    for k, v in form.items():
                        if hasattr(v, "filename"):  # single file upload
                            args[k] = v.filename
                        elif isinstance(v, list) and v and hasattr(v[0], "filename"):
                            args[k] = [file.filename for file in v]
                        else:
                            args[k] = v
                except Exception:
                    pass
            else:
                if isinstance(body, dict):
                    args.update(body)
                elif body is not None:
                    # dict.update() would raise on a list or scalar and fail the request.
                    args["body"] = body
        return args

    async def get_response_body(self, request: Request, response: Response) -> Any:
        """Return the response body, parsed as JSON when possible.

        Responses whose Content-Length exceeds ``max_body_size`` are replaced
        by a placeholder dict. Streaming bodies are fully read and then put
        back so the client still receives them.
        """
        # Check Content-Length first so oversized bodies are never read.
        content_length = response.headers.get("content-length")
        if content_length and int(content_length) > self.max_body_size:
            return {"code": 0, "msg": "Response too large to log", "data": None}

        if hasattr(response, "body"):
            body = response.body
        else:
            body_chunks = []
            async for chunk in response.body_iterator:
                if not isinstance(chunk, bytes):
                    chunk = chunk.encode(response.charset)
                body_chunks.append(chunk)
            # The iterator is now exhausted; replace it with a replay of the chunks.
            response.body_iterator = self._async_iter(body_chunks)
            body = b"".join(body_chunks)

        if any(request.url.path.startswith(path) for path in self.audit_log_paths):
            try:
                data = self.lenient_json(body)
                # Keep only the basic information: drop the nested response
                # bodies from audit-log listings.
                if isinstance(data, dict):
                    data.pop("response_body", None)
                    if "data" in data and isinstance(data["data"], list):
                        for item in data["data"]:
                            item.pop("response_body", None)
                return data
            except Exception:
                return None
        return self.lenient_json(body)

    def lenient_json(self, v: Any) -> Any:
        """Parse ``v`` as JSON when it is str/bytes and valid; otherwise return it unchanged."""
        if isinstance(v, (str, bytes)):
            try:
                return json.loads(v)
            except (ValueError, TypeError):
                pass
        return v

    async def _async_iter(self, items: list[bytes]) -> AsyncGenerator[bytes, None]:
        """Yield the given chunks one by one as an async generator."""
        for item in items:
            yield item

    async def get_request_log(self, request: Request, response: Response) -> dict:
        """Build the audit-log fields derived from the request and response.

        Includes path, status, method, the matching route's tags and summary,
        and the user resolved from the ``token`` header (0 / "" when absent or
        when authentication fails).
        """
        data: dict = {"path": request.url.path, "status": response.status_code, "method": request.method, "summary": "", "module": ""}
        # Route information
        app: FastAPI = request.app
        for route in app.routes:
            if (
                isinstance(route, APIRoute)
                and route.path_regex.match(request.url.path)
                and request.method in route.methods
            ):
                data["module"] = ",".join(route.tags) if route.tags else ""
                data["summary"] = route.summary or ""
        # User information
        try:
            token = request.headers.get("token")
            user_obj = None
            if token:
                user_obj: User = await AuthControl.is_authed(token)
            data["user_id"] = user_obj.id if user_obj else 0
            data["username"] = user_obj.username if user_obj else ""
        except Exception:
            data["user_id"] = 0
            data["username"] = ""
        return data

    async def before_request(self, request: Request):
        """Collect the request arguments and stash them on ``request.state``."""
        request_args = await self.get_request_args(request)
        request.state.request_args = request_args

    async def after_request(self, request: Request, response: Response, process_time: int):
        """Create the ``AuditLog`` row unless the method or path is excluded.

        Args:
            process_time: Handling time in milliseconds.
        """
        if request.method in self.methods:
            for path in self.exclude_paths:
                if re.search(path, request.url.path, re.I) is not None:
                    return
            data: dict = await self.get_request_log(request=request, response=response)
            data["response_time"] = process_time
            data["request_args"] = request.state.request_args
            data["response_body"] = await self.get_response_body(request, response)
            await AuditLog.create(**data)
        return response

    async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
        """Time the request and run the before/after hooks around it."""
        start_time: datetime = datetime.now()
        await self.before_request(request)
        response = await call_next(request)
        end_time: datetime = datetime.now()
        process_time = int((end_time.timestamp() - start_time.timestamp()) * 1000)
        await self.after_request(request, response, process_time)
        return response

65
app/core/proxy_rule.py Normal file
View File

@ -0,0 +1,65 @@
import json
import re
from typing import Any, Dict, List
from app.models.cleaning import ProxyProvider
def _resolve_path(data: Any, path: str) -> Any:
if not path:
return data
current = data
for part in path.split("."):
if isinstance(current, list):
try:
index = int(part)
except ValueError:
return None
if index < 0 or index >= len(current):
return None
current = current[index]
elif isinstance(current, dict):
current = current.get(part)
else:
return None
return current
def parse_proxies(raw_body: str, provider: ProxyProvider) -> List[str]:
    """Extract proxy strings from a provider response.

    In ``"json"`` mode the body is parsed as JSON, the item list is located
    through ``provider.list_path`` (a single item is wrapped in a list), and
    the fields whose ``*_path`` is configured are rendered through the
    template. In ``"text"`` mode every match of ``provider.pattern`` is
    rendered using its named groups. Any other mode yields an empty list.

    The template defaults to ``"{ip}:{port}"`` when the provider has none.
    """
    template = provider.template or "{ip}:{port}"
    mode = provider.mode

    if mode == "json":
        parsed = json.loads(raw_body)
        items = _resolve_path(parsed, provider.list_path) if provider.list_path else parsed
        if items is None:
            return []
        if not isinstance(items, list):
            items = [items]
        # Only fields with a configured path take part in the template context.
        field_paths = (
            ("ip", provider.ip_path),
            ("port", provider.port_path),
            ("username", provider.username_path),
            ("password", provider.password_path),
        )
        proxies: List[str] = []
        for item in items:
            context: Dict[str, Any] = {
                field: _resolve_path(item, field_path)
                for field, field_path in field_paths
                if field_path
            }
            proxies.append(template.format(**context))
        return proxies

    if mode == "text":
        if not provider.pattern:
            return []
        matcher = re.compile(provider.pattern)
        return [template.format(**found.groupdict()) for found in matcher.finditer(raw_body)]

    return []
async def parse_proxies_with_provider(provider_id: int, raw_body: str) -> List[str]:
    """Load the ``ProxyProvider`` with the given id and parse ``raw_body`` with its rules."""
    return parse_proxies(raw_body, await ProxyProvider.get(id=provider_id))

352
app/core/scheduler.py Normal file
View File

@ -0,0 +1,352 @@
import asyncio
import json
import uuid
from datetime import datetime, timedelta
import subprocess
import sys
from pathlib import Path
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger
from app.core.clickhouse import clickhouse_manager
from app.core.locks import DistributedLock
from app.log import logger
from app.settings.config import settings
from app.models.metrics import ScheduledTaskRun, StatsTotal
# Module-level scheduler handle; starts out as None.
scheduler: AsyncIOScheduler | None = None
async def _record_task_run(task_id: str, task_name: str, status: str, started_at: datetime, error: str | None = None):
    """Persist one ``ScheduledTaskRun`` row describing a finished task run.

    The finish time is taken when this function is called. The duration is
    stored in whole milliseconds, and a missing error is stored as "".
    """
    finished_at = datetime.now()
    elapsed_seconds = finished_at.timestamp() - started_at.timestamp()
    record = {
        "task_id": task_id,
        "task_name": task_name,
        "status": status,
        "started_at": started_at,
        "finished_at": finished_at,
        "duration_ms": int(elapsed_seconds * 1000),
        "error": error or "",
    }
    await ScheduledTaskRun.create(**record)
async def stats_job():
    """Count rows in each ClickHouse job/company table and report the totals.

    For every table the overall count is stored as a ``StatsTotal`` row and
    today's new-row count is computed. The combined payload is then posted
    and e-mailed. The run is skipped when the distributed lock is not
    obtained, and the outcome (success or fail) is recorded either way.
    """
    task_id = str(uuid.uuid4())
    started_at = datetime.now()
    task_name = "stats_job"

    async def _single_count(client, sql: str) -> int:
        # Both statistics queries return one row with one COUNT() column.
        rows = await client.query(sql)
        return int(rows.result_rows[0][0]) if rows.result_rows else 0

    lock = DistributedLock(name=task_name, ttl_seconds=600)
    async with lock.context() as acquired:
        if not acquired:
            logger.info("stats_job skipped: lock not acquired")
            return
        try:
            client = await clickhouse_manager.get_client()
            # (source, data type, table name)
            tables = [
                ("boss", "job", "boss_job"),
                ("qcwy", "job", "qcwy_job"),
                ("zhilian", "job", "zhilian_job"),
                ("boss", "company", "boss_company"),
                ("qcwy", "company", "qcwy_company"),
                ("zhilian", "company", "zhilian_company"),
            ]
            results: list[dict] = []
            for source, data_type, table in tables:
                total_count = await _single_count(client, f"SELECT COUNT() AS cnt FROM job_data.{table}")
                await StatsTotal.create(source=source, table_type=data_type, count=total_count, ts=datetime.now())

                daily_sql = (
                    f"SELECT COUNT() AS cnt FROM job_data.{table} "
                    f"WHERE created_at >= toStartOfDay(now()) AND created_at < toStartOfDay(now()) + INTERVAL 1 DAY"
                )
                daily_count = await _single_count(client, daily_sql)

                results.append(
                    {
                        "source": source,
                        "type": data_type,
                        "table": table,
                        "total": total_count,
                        "daily_new": daily_count,
                    }
                )

            payload = {
                "task_id": task_id,
                "ts": datetime.now().isoformat(),
                "totals": results,
            }
            await _post_with_retry(json.dumps(payload))
            await _send_email("6小时数据统计", payload)
            await _record_task_run(task_id, task_name, "success", started_at)
        except Exception as exc:
            logger.error(f"stats_job failed: {exc}")
            await _record_task_run(task_id, task_name, "fail", started_at, error=str(exc))
async def ip_alert_job():
"""Flag IPs that have not reported within the recent window and send an alert.

The original docstring states this runs every 10 minutes. The window length
comes from ``settings.ALERT_WINDOW_MINUTES`` (default 10). Only
``IpUploadStats`` rows dated today (UTC) are examined. The run is skipped
when the distributed lock is not obtained, and the outcome is recorded via
``_record_task_run``.
"""
from app.models.metrics import IpUploadStats # deferred import to avoid a circular import
task_id = str(uuid.uuid4())
started_at = datetime.now()
task_name = "ip_alert_job"
lock = DistributedLock(name=task_name, ttl_seconds=300)
async with lock.context() as acquired:
if not acquired:
logger.info("ip_alert_job skipped: lock not acquired")
return
try:
window_minutes = getattr(settings, "ALERT_WINDOW_MINUTES", 10)
# Use a timezone-aware datetime so comparisons with database datetimes do not fail
from datetime import timezone
# Build an aware datetime in UTC
now = datetime.now(timezone.utc)
threshold = now - timedelta(minutes=window_minutes)
cutoff = now.date()
items = await IpUploadStats.filter(date=cutoff).all()
anomalies: list[dict] = []
for item in items:
last_at = getattr(item, "last_report_at", None)
# If last_at is a naive datetime, convert it to an aware one before comparing
if last_at is not None:
# Naive datetime check (no tzinfo)
if last_at.tzinfo is None:
# Assumes the database stores UTC; attach the UTC timezone
last_at = last_at.replace(tzinfo=timezone.utc)
# Stale: never reported, or last report older than the window.
if last_at is None or last_at < threshold:
if getattr(item, "status", "normal") != "abnormal":
item.status = "abnormal"
await item.save(update_fields=["status"])
# NOTE(review): whether this append happens for every stale row or only when
# the status just changed depends on nesting that is not visible here - confirm.
anomalies.append({
"source": item.source,
"ip": item.ip,
"last_report_at": last_at.isoformat() if last_at else None,
"window_minutes": window_minutes,
})
# Notify only when at least one anomaly was collected.
if anomalies:
payload = {"task_id": task_id, "ts": datetime.now().isoformat(), "anomalies": anomalies}
await _post_with_retry(json.dumps(payload))
await _send_email("IP上报异常告警", payload)
duration = (datetime.now() - started_at).total_seconds()
logger.info(f"ip_alert_job completed in {duration:.2f} seconds")
await _record_task_run(task_id, task_name, "success", started_at)
except Exception as e:
logger.error(f"ip_alert_job failed: {e}")
await _record_task_run(task_id, task_name, "fail", started_at, error=str(e))
async def ecs_full_pipeline_job():
"""Runs every 6 hours: execute the complete ecs_full_pipeline.py flow and record the outcome."""
task_id = str(uuid.uuid4())
started_at = datetime.now()
task_name = "ecs_full_pipeline"
lock = DistributedLock(name=task_name, ttl_seconds=1800)
async with lock.context() as acquired:
if not acquired:
logger.info("ecs_full_pipeline skipped: lock not acquired")
return
try:
# parents[2] is two levels above the directory that contains this file
root = Path(__file__).resolve().parents[2]
script = root / "ecs_full_pipeline.py"
log = root / "ecs_full_pipeline.log"
# Append mode: each run adds to the same log file
with open(log, "a", encoding="utf-8") as f:
f.write(f"\n[定时] 开始执行 pipeline{started_at.isoformat()}\n")
# subprocess.run blocks, so it is handed to a worker thread; the child's stdout and stderr go to the log file
proc = await asyncio.to_thread(
subprocess.run,
[sys.executable, "-u", str(script)],
stdout=f,
stderr=f,
text=True,
)
# A non-zero exit code marks the run as failed and is stored as "rc=<code>"
status = "success" if proc.returncode == 0 else "fail"
await _record_task_run(task_id, task_name, status, started_at, None if status == "success" else f"rc={proc.returncode}")
except Exception as e:
logger.error(f"ecs_full_pipeline failed: {e}")
await _record_task_run(task_id, task_name, "fail", started_at, error=str(e))
async def company_cleaning_job():
    """Runs every 5 minutes: collect pending companies, then clean a small batch of them.

    The run is skipped when the distributed lock cannot be acquired. The
    outcome (success or fail) is stored through ``_record_task_run``.
    """
    from app.services.company_cleaner import company_cleaner

    task_name = "company_cleaning_job"
    started_at = datetime.now()
    task_id = str(uuid.uuid4())
    # The job fires frequently, so the lock TTL is kept short.
    lock = DistributedLock(name=task_name, ttl_seconds=300)

    async with lock.context() as acquired:
        if acquired:
            try:
                logger.info("Running automated company cleaning job...")
                # Step 1: collect new data (the original comment mentions a 7-day rule).
                # The batch is kept small so the run fits inside the 5-minute interval.
                await company_cleaner.collect_pending_companies(limit=50)
                # Step 2: process pending rows with a small delay between requests.
                # Author's estimate: 30 companies at roughly 3-5 seconds each plus the
                # delay is about 2-3 minutes, leaving time for collection and other work.
                await company_cleaner.process_pending_companies(limit=30, max_delay_seconds=1)
                elapsed = (datetime.now() - started_at).total_seconds()
                logger.info(f"company_cleaning_job completed in {elapsed:.2f} seconds")
                await _record_task_run(task_id, task_name, "success", started_at)
            except Exception as exc:
                logger.error(f"company_cleaning_job failed: {exc}")
                await _record_task_run(task_id, task_name, "fail", started_at, error=str(exc))
        else:
            logger.info("company_cleaning_job skipped: lock not acquired")
async def daily_cleanup_job():
    """Runs daily at 00:05: clean up task records that are already finished.

    Delegates the cleanup to ``company_cleaner.cleanup_old_records`` and stores
    the outcome through ``_record_task_run``. The run is skipped, with a log
    line like the other scheduled jobs, when the distributed lock is not acquired.
    """
    from app.services.company_cleaner import company_cleaner

    task_id = str(uuid.uuid4())
    started_at = datetime.now()
    task_name = "daily_cleanup_job"
    lock = DistributedLock(name=task_name, ttl_seconds=3600)
    async with lock.context() as acquired:
        if not acquired:
            # Previously this returned silently, so skipped runs left no trace in the logs.
            logger.info("daily_cleanup_job skipped: lock not acquired")
            return
        try:
            logger.info("Running daily cleanup job...")
            await company_cleaner.cleanup_old_records()
            await _record_task_run(task_id, task_name, "success", started_at)
        except Exception as e:
            logger.error(f"daily_cleanup_job failed: {e}")
            await _record_task_run(task_id, task_name, "fail", started_at, error=str(e))
async def _post_with_retry(body: str):
    """POST a JSON report body to ``settings.REPORT_ENDPOINT``, retrying on failure.

    Args:
        body: Already-serialised JSON text, sent with a JSON content type.

    Returns:
        None. The function returns as soon as a 2xx response is received.
        It never raises for delivery problems: every failed attempt is logged
        as a warning and an error is logged once all attempts are used up.

    Settings read (each optional): ``REPORT_ENDPOINT`` (reporting is skipped
    when empty), ``REPORT_MAX_RETRIES`` (default 3), ``REPORT_TIMEOUT``
    (default 10).
    """
    import httpx

    endpoint = getattr(settings, "REPORT_ENDPOINT", "")
    if not endpoint:
        logger.warning("REPORT_ENDPOINT not configured; skip reporting")
        return
    max_retries = getattr(settings, "REPORT_MAX_RETRIES", 3)
    timeout = getattr(settings, "REPORT_TIMEOUT", 10)
    async with httpx.AsyncClient(timeout=timeout) as client:
        for attempt in range(1, max_retries + 1):
            try:
                resp = await client.post(endpoint, headers={"Content-Type": "application/json"}, content=body)
                if 200 <= resp.status_code < 300:
                    return
                # Non-2xx answers go through the same retry path as transport errors.
                raise RuntimeError(f"status={resp.status_code} body={resp.text}")
            except Exception as e:
                logger.warning(f"report attempt {attempt} failed: {e}")
            # Linear back-off capped at 15 seconds. Nothing follows the last
            # attempt, so sleeping there would only delay the caller.
            if attempt < max_retries:
                await asyncio.sleep(min(5 * attempt, 15))
    logger.error(f"report failed after {max_retries} attempts; giving up")
def _build_email_html(subject: str, payload: dict) -> str:
"""构建HTML邮件内容"""
ts = payload.get("ts") or datetime.now().isoformat()
style = (
"body{font-family:Arial,Helvetica,sans-serif;background:#f7f7f9;color:#333;}"
"h1{font-size:20px;margin:0 0 10px;}"
"p.meta{color:#666;font-size:12px;margin:0 0 16px;}"
"table{border-collapse:collapse;width:100%;background:#fff;border:1px solid #e5e7eb;}"
"th,td{border:1px solid #e5e7eb;padding:8px;text-align:left;font-size:13px;}"
"th{background:#f3f4f6;}"
".section{margin-top:18px;}"
".badge{display:inline-block;background:#2563eb;color:#fff;border-radius:12px;padding:2px 8px;font-size:12px;margin-left:8px;}"
)
html_head = f"<h1>{subject}<span class=\"badge\">{ts}</span></h1><p class=\"meta\">自动统计与通知</p>"
if "totals" in payload:
rows = "".join(
f"<tr><td>{r.get('source')}</td><td>{r.get('type')}</td><td>{r.get('table')}</td><td>{r.get('total')}</td><td>{r.get('daily_new')}</td></tr>"
for r in payload.get("totals", [])
)
table = f"<table><thead><tr><th>来源</th><th>类型</th><th>表名</th><th>总量</th><th>今日新增</th></tr></thead><tbody>{rows}</tbody></table>"
return f"<html><head><meta charset='utf-8'><style>{style}</style></head><body>{html_head}{table}</body></html>"
if "anomalies" in payload:
rows = "".join(
f"<tr><td>{a.get('source')}</td><td>{a.get('ip')}</td><td>{a.get('date')}</td></tr>" for a in payload.get("anomalies", [])
)
table = f"<table><thead><tr><th>来源</th><th>IP</th><th>日期</th></tr></thead><tbody>{rows}</tbody></table>"
return f"<html><head><meta charset='utf-8'><style>{style}</style></head><body>{html_head}{table}</body></html>"
body = json.dumps(payload, ensure_ascii=False, indent=2)
pre = f"<pre style='background:#111827;color:#e5e7eb;padding:12px;border-radius:6px;overflow:auto;font-size:12px;'>{body}</pre>"
return f"<html><head><meta charset='utf-8'><style>{style}</style></head><body>{html_head}{pre}</body></html>"
async def _send_email(subject: str, payload: dict):
    """Send an HTML notification e-mail built from ``payload``.

    Args:
        subject: Mail subject, also used as the heading of the HTML body.
        payload: Data handed to ``_build_email_html`` to render the body.

    Returns:
        None. Sending is best-effort: when SMTP is not configured the call is
        skipped with a warning, and delivery errors are logged, not raised.

    Settings read (via ``getattr``): ``SMTP_HOST``, ``SMTP_USER``, ``SMTP_PASS``,
    ``SMTP_FROM`` (defaults to the user), ``SMTP_TO``, ``SMTP_PORT`` (default
    587) and ``SMTP_TIMEOUT`` (seconds, default 30).
    """
    import smtplib
    from email.mime.text import MIMEText
    from email.utils import formataddr

    host = getattr(settings, "SMTP_HOST", "")
    user = getattr(settings, "SMTP_USER", "")
    password = getattr(settings, "SMTP_PASS", "")
    sender = getattr(settings, "SMTP_FROM", user)
    recipients = getattr(settings, "SMTP_TO", ["zfc9393@163.com"]) or ["zfc9393@163.com"]
    if isinstance(recipients, str):
        # A comma-separated string would otherwise be joined and delivered
        # character by character.
        recipients = [addr.strip() for addr in recipients.split(",") if addr.strip()] or ["zfc9393@163.com"]
    if not host or not user or not password:
        logger.warning("SMTP not configured; skip email sending")
        return
    html = _build_email_html(subject, payload)
    msg = MIMEText(html, "html", "utf-8")
    msg["Subject"] = subject
    msg["From"] = formataddr(("JobData", sender))
    msg["To"] = ", ".join(recipients)
    port = getattr(settings, "SMTP_PORT", 587)
    timeout = getattr(settings, "SMTP_TIMEOUT", 30)

    def _deliver() -> None:
        """Blocking SMTP exchange; the context manager closes the connection even on failure."""
        with smtplib.SMTP(host, port, timeout=timeout) as server:
            server.starttls()
            server.login(user, password)
            server.sendmail(sender, recipients, msg.as_string())

    try:
        # smtplib is synchronous; run it in a worker thread so the event loop is not blocked.
        await asyncio.to_thread(_deliver)
    except Exception as e:
        logger.error(f"email send failed: {e}")
def start_scheduler():
    """Create the global scheduler, register every periodic job and start it.

    Calling it again while a scheduler already exists is a no-op.
    """
    global scheduler
    if scheduler is not None:
        return

    # Defaults shared by all jobs. The original comments describe max_instances
    # as letting runs queue up, with real concurrency control left to the
    # distributed lock inside each job.
    # NOTE(review): confirm against the APScheduler docs what max_instances bounds.
    job_defaults = {
        'coalesce': True,  # merge several pending runs of a job into one
        'max_instances': 3,
        'misfire_grace_time': 600,  # tolerate up to 600 seconds (10 minutes) of delay
    }
    scheduler = AsyncIOScheduler(job_defaults=job_defaults)

    # (callable, trigger, job id, extra add_job keyword arguments), in registration order
    registrations = (
        # every 6 hours: statistics
        (stats_job, CronTrigger(second=0, minute=0, hour="*/6"), "stats_job", {}),
        # every 6 hours: full ECS pipeline
        (ecs_full_pipeline_job, CronTrigger(second=0, minute=0, hour="*/6"), "ecs_full_pipeline", {}),
        # every 10 minutes: IP reporting alert
        (ip_alert_job, CronTrigger(second=0, minute="*/10"), "ip_alert_job", {}),
        # every 5 minutes: automatic company cleaning; max_instances is passed explicitly
        (company_cleaning_job, CronTrigger(second=0, minute="*/5"), "company_cleaning_job", {"max_instances": 3}),
        # daily at 00:05: cleanup of historical records
        (daily_cleanup_job, CronTrigger(second=0, minute=5, hour=0), "daily_cleanup_job", {}),
    )
    for job_func, trigger, job_id, extra in registrations:
        scheduler.add_job(job_func, trigger, id=job_id, replace_existing=True, **extra)

    scheduler.start()
def shutdown_scheduler():
    """Stop the global scheduler without waiting for running jobs and forget it."""
    global scheduler
    if scheduler is None:
        return
    scheduler.shutdown(wait=False)
    scheduler = None

View File

@ -0,0 +1,33 @@
# ALIGNMENT — qcwy_signature_refactor
阶段: Align(对齐阶段)
任务名称: 前程无忧 qcwy_api 签名函数提取与爬虫代码重构(风格对齐 boss_api.py
1. 项目上下文分析
- 技术栈: Python, httpx/requests, FastAPI(服务端),前端 Vue3(web)
- 相关文件: new2025/boss/boss_api.py(风格参考)、new2025/qcwy/qcwy_api.py(待重构)
- 现有模式: qcwy_api.py 内部包含 ProxyConfig/ProxyManager/RemoteReporter/SignatureGenerator/JobCrawler 等类,使用 httpx.Client 进行请求boss_api.py 使用 requests.Session 并提供统一日志/调试输出风格。
- 业务域: 爬取 51job 小程序接口的职位数据,并将职位与公司详情按约定上报至后端 universal data 接口。
2. 原始需求与边界确认
- 需求: 在 qcwy_api.py 中“提取爬虫的签名函数”并进行“qcwy_api 重构”,使“代码风格和 boss_api 一致”,且“只需要替换爬虫相关的代码”。
- 边界: 不改动非爬虫相关的逻辑(如数据结构、业务上报接口契约),尽量避免引入新的外部依赖;维持 httpx 客户端使用与现有方法签名一致性。
3. 需求理解
- 提取签名函数: 将签名生成逻辑抽到独立函数 generate_qcwy_signature,支持 url_path + 请求体签名;布尔值需转 "true"/"false";HMAC-SHA256 hexdigest。
- 提取 property 构建函数: 独立函数 build_qcwy_property,输出 URL 编码的 JSON;JobCrawler.build_property 调用该函数。
- 风格对齐: 顶部打印 [DEBUG] 模块加载提示;logging.basicConfig 增加 StreamHandler;函数注释采用 boss_api.py 风格。
4. 智能决策策略
- 参考 boss_api.py 的结构与日志风格,不强行将 httpx 改成 requests以减少改动面与风险仅在签名与 property 构建、日志风格上进行统一。
- 复用现有接口调用流程,确保对外契约不变。
5. 疑问澄清(需确认)
- 是否需要将 httpx.Client 改为 requests.Session 以进一步风格一致?当前按“只替换爬虫相关代码”理解为保持 httpx。
- 签名 key 是否固定由调用方维护?目前保留原始默认值并通过依赖注入传入。
6. 验收标准
- qcwy_api.py 中存在可复用的 generate_qcwy_signature 与 build_qcwy_property 函数,并在 JobCrawler 内部调用。
- 日志与 debug 输出风格与 boss_api.py 保持一致。
- 现有功能(如 get_recommend_jobs保持原有行为不破坏接口契约与上报逻辑。

View File

@ -0,0 +1,25 @@
# CONSENSUS — qcwy_signature_refactor
阶段: Align 最终共识
明确的需求与验收标准
- 需求: 提取 51job 爬虫的签名函数与 property 构建函数为独立方法,并在 JobCrawler 内复用;对齐 boss_api.py 的日志与调试输出风格;仅修改与爬虫相关的代码。
- 验收标准:
- 存在 generate_qcwy_signature(sign_key, url_path, data) 独立函数,输出与原有签名一致。
- 存在 build_qcwy_property(manual_login_method, page_code) 独立函数,并在 JobCrawler.build_property 内复用。
- 在模块顶部添加 [DEBUG] 输出与 logging StreamHandler日志风格一致。
- 不影响对外调用契约与数据上报逻辑。
技术实现方案与约束
- 保持 httpx.Client 以减少非爬虫相关的改动;签名与 property 独立函数供各 API 组装复用。
- 函数注释与类型标注齐全;布尔值在签名前转换为字符串,确保线上一致性。
- 不引入新依赖,严格遵守 .env 管理敏感配置的规范。
集成方案
- JobCrawler 内部继续使用 self.signature_generator.generate_signature 与 self.build_property但内部转调独立函数保证现有方法签名不变。
边界与限制
- 不调整代理与上报接口的现有实现;如需进一步统一至 requests.Session需单独评审并更新 DESIGN/TASK 文档。
不确定性状态
- 当前均已解决;若后续出现兼容性问题,将在 TASK/ACCEPTANCE 文档中记录并处置。

View File

@ -0,0 +1,60 @@
# DESIGN — qcwy_signature_refactor
1) 整体架构图
```mermaid
flowchart TD
A[JobCrawler 调用层] --> B[签名生成 generate_qcwy_signature]
A --> C[property 构建 build_qcwy_property]
A --> D[httpx.Client 请求发送]
D --> E[51job API]
A --> F[RemoteReporter 数据上报]
```
2) 分层设计与核心组件
- Crawler 层(JobCrawler):业务编排、参数组装、headers(签名与property)、调试打印、调用 _make_request
- Signature 层:generate_qcwy_signature(独立函数,HMAC-SHA256签名)+ SignatureGenerator(兼容旧接口)
- Property 层:build_qcwy_property(独立函数,URL编码后的JSON)
- Transport 层:httpx.Client(保留会话与代理切换)
- Reporter 层:RemoteReporter(上报到后端 universal data)
3) 模块依赖关系图
```mermaid
flowchart LR
JobCrawler --> SignatureGenerator --> generate_qcwy_signature
JobCrawler --> build_qcwy_property
JobCrawler --> httpxClient
JobCrawler --> RemoteReporter
```
4) 接口契约定义
- generate_qcwy_signature(sign_key: str, url_path: str, data: Optional[Dict]) -> str
- build_qcwy_property(manual_login_method: str = "", page_code: str = "home|hotjob|jobfxlist") -> str
- SignatureGenerator.generate_signature(url_path: str, data: Optional[Dict]) -> str
5) 数据流向图
```mermaid
sequenceDiagram
participant J as JobCrawler
participant S as Signature
participant P as Property
participant X as httpx.Client
participant API as 51job API
participant R as RemoteReporter
J->>S: 生成签名
J->>P: 构建 property
J->>X: 发送请求(headers含签名与property)
X->>API: HTTP 请求
API-->>J: 返回 JSON 响应
J->>R: 上报数据(批量)
```
6) 异常处理策略
- 代理异常:在 _make_request 中检测典型错误码/异常关键字,触发 _switch_proxy_and_reinit。
- 网络异常:指数退避重试(最多3次);返回 None 时上层做空值兜底。
- 解析异常:捕获并记录日志,返回空集合/空字典,避免中断流水线。
7) 设计原则
- 任务范围内的最小改动:仅替换爬虫相关签名与property构建逻辑。
- 保持对外契约不变:保留 JobCrawler 与 Reporter 行为接口。
- 与 boss_api 风格对齐:日志配置、调试输出统一。

View File

@ -0,0 +1,33 @@
# TASK — qcwy_signature_refactor
1) 原子任务拆分
- T1 提取独立签名函数 generate_qcwy_signature(输入:sign_key/url_path/data;输出:签名字符串;约束:HMAC-SHA256)
- T2 提取独立 property 构建函数 build_qcwy_property(输入:manual_login_method/page_code;输出:URL编码字符串)
- T3 JobCrawler 复用新函数并统一调试输出(输入:URL/headers/body;输出:HTTP响应)
- T4 扩充函数级注释(方法与参数/返回说明),满足代码输出标准
- T5 编写单元测试:签名一致性/布尔值转换/属性编码有效性
- T6 文档与进度同步(ALIGNMENT/CONSENSUS/DESIGN/TASK/ACCEPTANCE/FINAL/TODO)
2) 依赖关系
```mermaid
flowchart TD
T1 --> T3
T2 --> T3
T3 --> T4
T3 --> T5
T4 --> T6
T5 --> T6
```
3) 任务详细约束
- 输入契约:
- T1: sign_key, url_path, data
- T2: manual_login_method,page_code
- T3: 复用T1/T2后组装请求
- 输出契约:
- T1/T2: 字符串且可用于HTTP headers
- T3: 成功返回JSON字典或空值兜底
- 实现约束:
- 与 boss_api 风格统一;不变更对外接口;不引入新依赖
- 验收标准:
- 单元测试通过;日志与调试输出统一;文档完整

View File

@ -0,0 +1,297 @@
# 通用数据路由API使用指南
## 概述
通用数据路由系统提供了一个统一的API接口,可以根据数据类型和平台自动将数据路由到对应的ClickHouse表中进行存储。这个系统支持所有主要的招聘平台(Boss直聘、前程无忧、智联招聘)和数据类型(职位、公司)。
## 核心特性
- **统一接口**: 所有平台的数据都通过相同的API接口处理
- **自动路由**: 根据数据类型和平台自动选择对应的存储表
- **重复检查**: 支持自动重复数据检查,避免数据冗余
- **批量处理**: 支持单条和批量数据处理
- **异步处理**: 支持异步后台任务处理大量数据
- **数据预处理**: 自动进行字段映射和数据转换
## API端点
### 基础路径
```
/api/v1/universal
```
### 主要端点
1. **存储单条数据**: `POST /data/store`
2. **批量存储数据**: `POST /data/batch-store`
3. **异步存储单条数据**: `POST /data/store-async`
4. **异步批量存储数据**: `POST /data/batch-store-async`
5. **获取支持的平台**: `GET /platforms`
### 平台特定便捷端点
- `POST /boss/job` - Boss直聘职位数据
- `POST /boss/company` - Boss直聘公司数据
- `POST /qcwy/job` - 前程无忧职位数据
- `POST /qcwy/company` - 前程无忧公司数据
- `POST /zhilian/job` - 智联招聘职位数据
- `POST /zhilian/company` - 智联招聘公司数据
## 数据类型和平台
### 支持的平台
- `boss` - Boss直聘
- `qcwy` - 前程无忧
- `zhilian` - 智联招聘
### 支持的数据类型
- `job` - 职位数据
- `company` - 公司数据
### 默认重复检查字段
| 平台 | 职位数据 | 公司数据 |
|------|----------|----------|
| boss | encrypt_job_id | encrypt_id |
| qcwy | job_id | company_id |
| zhilian | job_id | company_id |
## 使用示例
### 1. 存储Boss直聘职位数据
```json
POST /api/v1/universal/data/store
Content-Type: application/json
{
"data": {
"securityId": "abc123",
"jobBaseInfoVO": {
"encryptJobId": "job_encrypt_123",
"jobId": 12345,
"positionName": "Python开发工程师",
"locationName": "北京",
"lowSalary": 15000,
"highSalary": 25000,
"jobDesc": "负责后端开发工作..."
},
"bossBaseInfoVO": {
"encryptBossId": "boss_encrypt_456",
"bossId": 67890,
"bossName": "张经理"
},
"brandComInfoVO": {
"brandName": "某科技公司",
"industryName": "互联网",
"scaleName": "100-499人"
}
},
"data_type": "job",
"platform": "boss",
"check_duplicate": true,
"duplicate_key": "encrypt_job_id"
}
```
### 2. 批量存储前程无忧职位数据
```json
POST /api/v1/universal/data/batch-store
Content-Type: application/json
{
"data_list": [
{
"job_name": "Java开发工程师",
"brand_name": "某互联网公司",
"job_addr": "上海",
"salary_desc": "15k-25k",
"job_desc": "负责Java后端开发..."
},
{
"job_name": "前端开发工程师",
"brand_name": "某科技公司",
"job_addr": "深圳",
"salary_desc": "12k-20k",
"job_desc": "负责前端页面开发..."
}
],
"data_type": "job",
"platform": "qcwy",
"check_duplicate": true
}
```
### 3. 使用便捷端点存储数据
```json
POST /api/v1/universal/boss/job
Content-Type: application/json
{
"securityId": "abc123",
"jobBaseInfoVO": {
"encryptJobId": "job_encrypt_789",
"positionName": "产品经理",
"locationName": "杭州"
},
"brandComInfoVO": {
"brandName": "某电商公司",
"industryName": "电子商务"
}
}
```
### 4. 异步批量处理大量数据
```json
POST /api/v1/universal/data/batch-store-async
Content-Type: application/json
{
"data_list": [
// ... 大量数据
],
"data_type": "job",
"platform": "zhilian",
"check_duplicate": true
}
```
## 响应格式
### 成功响应
```json
{
"code": 200,
"message": "数据存储成功",
"data": {
"success": true,
"message": "数据存储成功",
"duplicate": false,
"result": true
},
"platform": "boss",
"data_type": "job"
}
```
### 重复数据响应
```json
{
"code": 400,
"message": "数据已存在",
"data": {
"success": false,
"message": "数据已存在",
"duplicate": true,
"existing_id": 12345
},
"platform": "boss",
"data_type": "job"
}
```
### 批量处理响应
```json
{
"code": 200,
"message": "批量处理完成: 成功 8 条,失败 1 条,重复 1 条",
"data": {
"total": 10,
"success": 8,
"failed": 1,
"duplicate": 1,
"errors": [
{
"index": 5,
"error": "缺少必要字段"
}
]
},
"platform": "qcwy",
"data_type": "job"
}
```
### 异步处理响应
```json
{
"code": 202,
"message": "数据已加入异步处理队列",
"platform": "zhilian",
"data_type": "job"
}
```
## 数据预处理
系统会根据不同平台自动进行数据预处理:
### Boss直聘数据预处理
- 从嵌套的VO对象中提取字段
- 映射到标准的数据库字段
- 保留原始数据在`raw_data`字段中
### 前程无忧数据预处理
- 直接映射字段名
- 处理数组类型字段
- 标准化日期格式
### 智联招聘数据预处理
- 处理复杂的嵌套结构
- 提取关键信息字段
- 转换数据类型
## 错误处理
### 常见错误码
- `400` - 请求参数错误或数据已存在
- `500` - 服务器内部错误
- `202` - 异步处理已接受(非错误,表示请求已加入后台处理队列)
### 错误响应示例
```json
{
"detail": "数据存储失败: 缺少必要字段 'encrypt_job_id'"
}
```
## 最佳实践
1. **使用重复检查**: 始终启用重复检查以避免数据冗余
2. **批量处理**: 对于大量数据,使用批量接口提高效率
3. **异步处理**: 对于非实时需求,使用异步接口避免超时
4. **错误处理**: 妥善处理API返回的错误信息
5. **数据验证**: 在发送前验证数据的完整性
## 监控和日志
系统会自动记录以下信息:
- 数据处理成功/失败统计
- 重复数据检测结果
- 处理时间和性能指标
- 错误详情和堆栈信息
可以通过应用日志查看详细的处理信息。
## 迁移指南
### 从平台特定API迁移
如果你之前使用平台特定的API(如`/api/v1/boss/job`),可以:
1. **继续使用便捷端点**: 使用`/api/v1/universal/boss/job`等便捷端点
2. **迁移到通用端点**: 使用`/api/v1/universal/data/store`通用端点
3. **逐步迁移**: 先测试新接口,确认无误后再完全迁移
### 数据格式兼容性
新的通用API完全兼容现有的数据格式,无需修改数据结构。

1
app/log/__init__.py Normal file
View File

@ -0,0 +1 @@
from .log import logger as logger

25
app/log/log.py Normal file
View File

@ -0,0 +1,25 @@
import sys
from loguru import logger as loguru_logger
from app.settings import settings
class Loggin:
    """Configures loguru to write to stdout at a level derived from ``settings.DEBUG``."""

    def __init__(self) -> None:
        # DEBUG verbosity only when the application runs in debug mode, INFO otherwise
        self.level = "DEBUG" if settings.DEBUG else "INFO"

    def setup_logger(self):
        """Remove loguru's current handlers, add a stdout sink at ``self.level`` and return the logger."""
        loguru_logger.remove()
        loguru_logger.add(sink=sys.stdout, level=self.level)
        # logger.add("my_project.log", level=level, rotation="100 MB")  # Output log messages to a file
        return loguru_logger


loggin = Loggin()
logger = loggin.setup_logger()

5
app/models/__init__.py Normal file
View File

@ -0,0 +1,5 @@
# 新增model需要在这里导入
from .admin import *
from .metrics import *
from .keyword import *
from .cleaning import *

89
app/models/admin.py Normal file
View File

@ -0,0 +1,89 @@
from tortoise import fields
from app.schemas.menus import MenuType
from .base import BaseModel, TimestampMixin
from .enums import MethodType
# Account model stored in table "user"; linked to Role through a many-to-many field.
class User(BaseModel, TimestampMixin):
username = fields.CharField(max_length=20, unique=True, description="用户名称", index=True)
alias = fields.CharField(max_length=30, null=True, description="姓名", index=True)
email = fields.CharField(max_length=255, unique=True, description="邮箱", index=True)
phone = fields.CharField(max_length=20, null=True, description="电话", index=True)
# NOTE(review): nullable, max 128 chars - presumably a password hash; confirm where it is written
password = fields.CharField(max_length=128, null=True, description="密码")
is_active = fields.BooleanField(default=True, description="是否激活", index=True)
is_superuser = fields.BooleanField(default=False, description="是否为超级管理员", index=True)
last_login = fields.DatetimeField(null=True, description="最后登录时间", index=True)
# Many-to-many link to Role; the reverse accessor on Role is "user_roles"
roles = fields.ManyToManyField("models.Role", related_name="user_roles")
# Plain integer department id (not declared as a foreign key)
dept_id = fields.IntField(null=True, description="部门ID", index=True)
class Meta:
table = "user"
# Role model stored in table "role"; carries the menus and APIs attached to the role.
class Role(BaseModel, TimestampMixin):
name = fields.CharField(max_length=20, unique=True, description="角色名称", index=True)
desc = fields.CharField(max_length=500, null=True, description="角色描述")
# Many-to-many links; reverse accessors are "role_menus" on Menu and "role_apis" on Api
menus = fields.ManyToManyField("models.Menu", related_name="role_menus")
apis = fields.ManyToManyField("models.Api", related_name="role_apis")
class Meta:
table = "role"
# API endpoint record stored in table "api": path, HTTP method, summary and tags.
class Api(BaseModel, TimestampMixin):
path = fields.CharField(max_length=100, description="API路径", index=True)
# Restricted to the values of the MethodType enum
method = fields.CharEnumField(MethodType, description="请求方法", index=True)
summary = fields.CharField(max_length=500, description="请求简介", index=True)
tags = fields.CharField(max_length=100, description="API标签", index=True)
class Meta:
table = "api"
# Menu entry stored in table "menu"; the hierarchy is expressed through parent_id.
class Menu(BaseModel, TimestampMixin):
name = fields.CharField(max_length=20, description="菜单名称", index=True)
# Free-form JSON, described by the author as a reserved field
remark = fields.JSONField(null=True, description="保留字段")
# Restricted to the values of the MenuType enum imported from app.schemas.menus
menu_type = fields.CharEnumField(MenuType, null=True, description="菜单类型")
icon = fields.CharField(max_length=100, null=True, description="菜单图标")
path = fields.CharField(max_length=100, description="菜单路径", index=True)
order = fields.IntField(default=0, description="排序", index=True)
# Defaults to 0 - presumably meaning "no parent"; verify against the menu tree builder
parent_id = fields.IntField(default=0, description="父菜单ID", index=True)
is_hidden = fields.BooleanField(default=False, description="是否隐藏")
component = fields.CharField(max_length=100, description="组件")
keepalive = fields.BooleanField(default=True, description="存活")
redirect = fields.CharField(max_length=100, null=True, description="重定向")
class Meta:
table = "menu"
# Department model stored in table "dept"; is_deleted is a soft-delete flag.
class Dept(BaseModel, TimestampMixin):
name = fields.CharField(max_length=20, unique=True, description="部门名称", index=True)
desc = fields.CharField(max_length=500, null=True, description="备注")
is_deleted = fields.BooleanField(default=False, description="软删除标记", index=True)
order = fields.IntField(default=0, description="排序", index=True)
# NOTE(review): max_length is passed to an IntField here - confirm it has any effect
parent_id = fields.IntField(default=0, max_length=10, description="父部门ID", index=True)
class Meta:
table = "dept"
# Ancestor/descendant pairs with a depth level - presumably a closure table for the
# Dept tree; verify against the code that maintains it. No Meta.table is declared.
class DeptClosure(BaseModel, TimestampMixin):
ancestor = fields.IntField(description="父代", index=True)
descendant = fields.IntField(description="子代", index=True)
level = fields.IntField(default=0, description="深度", index=True)
# Audit record of one request: who called what, the status code, the response time
# and the request/response payloads. No Meta.table is declared.
class AuditLog(BaseModel, TimestampMixin):
user_id = fields.IntField(description="用户ID", index=True)
username = fields.CharField(max_length=64, default="", description="用户名称", index=True)
module = fields.CharField(max_length=64, default="", description="功能模块", index=True)
summary = fields.CharField(max_length=128, default="", description="请求描述", index=True)
method = fields.CharField(max_length=10, default="", description="请求方法", index=True)
path = fields.CharField(max_length=255, default="", description="请求路径", index=True)
# Defaults to -1 - presumably meaning "no status recorded"; confirm against the writer
status = fields.IntField(default=-1, description="状态码", index=True)
# Milliseconds, per the field description
response_time = fields.IntField(default=0, description="响应时间(单位ms)", index=True)
request_args = fields.JSONField(null=True, description="请求参数")
response_body = fields.JSONField(null=True, description="返回数据")

62
app/models/base.py Normal file
View File

@ -0,0 +1,62 @@
import asyncio
from datetime import datetime
from tortoise import fields, models
from app.settings import settings
# Abstract base for the project's Tortoise models: a BigInt primary key plus
# helpers that serialise an instance (and optionally its many-to-many relations).
class BaseModel(models.Model):
    id = fields.BigIntField(pk=True, index=True)

    async def to_dict(self, m2m: bool = False, exclude_fields: list[str] | None = None):
        """Serialise the instance's database fields into a plain dict.

        Datetime values are rendered with ``settings.DATETIME_FORMAT``. When
        ``m2m`` is true, every many-to-many field not listed in
        ``exclude_fields`` is fetched concurrently and added as a list of dicts.
        """
        skipped = [] if exclude_fields is None else exclude_fields
        payload = {}
        for column in self._meta.db_fields:
            if column in skipped:
                continue
            raw = getattr(self, column)
            payload[column] = raw.strftime(settings.DATETIME_FORMAT) if isinstance(raw, datetime) else raw
        if not m2m:
            return payload
        pending = [
            self.__fetch_m2m_field(relation, skipped)
            for relation in self._meta.m2m_fields
            if relation not in skipped
        ]
        for relation, rows in await asyncio.gather(*pending):
            payload[relation] = rows
        return payload

    async def __fetch_m2m_field(self, field, exclude_fields):
        """Load one many-to-many relation and return ``(field, rows)`` with datetimes formatted."""
        related_rows = await getattr(self, field).all().values()
        rendered = [
            {
                key: (val.strftime(settings.DATETIME_FORMAT) if isinstance(val, datetime) else val)
                for key, val in row.items()
                if key not in exclude_fields
            }
            for row in related_rows
        ]
        return field, rendered

    class Meta:
        abstract = True
# Mixin that adds a unique, indexed UUID column which is not the primary key.
class UUIDModel:
uuid = fields.UUIDField(unique=True, pk=False, index=True)
# Mixin that adds indexed creation and modification timestamps.
class TimestampMixin:
# Declared with auto_now_add=True
created_at = fields.DatetimeField(auto_now_add=True, index=True)
# Declared with auto_now=True
updated_at = fields.DatetimeField(auto_now=True, index=True)

45
app/models/cleaning.py Normal file
View File

@ -0,0 +1,45 @@
from tortoise import fields
from app.models.base import TimestampMixin, BaseModel
# One cleaning request and its outcome, stored in table "cleaning_task".
class CleaningTask(BaseModel, TimestampMixin):
# Target to clean: a URL, a company name or an id (per the field description)
target = fields.CharField(max_length=255, description="目标(URL/公司名/ID)")
clean_type = fields.CharField(max_length=50, default="auto", description="清洗模式")
platform = fields.CharField(max_length=50, default="auto", description="目标平台")
proxy = fields.CharField(max_length=255, null=True, description="代理地址")
# Documented values: pending / processing / success / fail
status = fields.CharField(max_length=20, default="pending", description="状态: pending/processing/success/fail")
# Documented values: saved / duplicate / failed / unknown
storage_status = fields.CharField(max_length=20, default="unknown", description="存储状态: saved/duplicate/failed/unknown")
remote_sent = fields.BooleanField(default=False, description="是否已远程推送")
result_summary = fields.JSONField(null=True, description="清洗结果摘要")
original_data = fields.JSONField(null=True, description="原始请求数据")
error_msg = fields.TextField(null=True, description="错误信息")
class Meta:
table = "cleaning_task"
# A configured proxy endpoint, stored in table "proxy_config".
class ProxyConfig(BaseModel, TimestampMixin):
name = fields.CharField(max_length=100, description="名称")
# Documented values: http / socks / tunnel
proxy_type = fields.CharField(max_length=20, description="代理类型: http/socks/tunnel")
# Documented values: boss / qcwy / zhilian / all
platform = fields.CharField(max_length=50, default="all", description="目标平台: boss/qcwy/zhilian/all")
proxy_url = fields.CharField(max_length=255, description="代理地址")
is_active = fields.BooleanField(default=True, description="是否可用")
class Meta:
table = "proxy_config"
# Describes how to parse a proxy provider's response into proxy URLs; table "proxy_provider".
class ProxyProvider(BaseModel, TimestampMixin):
name = fields.CharField(max_length=100, description="名称")
# Documented values: boss / qcwy / zhilian / all
platform = fields.CharField(max_length=50, default="all", description="目标平台: boss/qcwy/zhilian/all")
# Parsing mode: json or text
mode = fields.CharField(max_length=20, description="解析模式: json/text")
# The *_path fields locate values inside a JSON response (presumably used when mode is "json" - verify)
list_path = fields.CharField(max_length=255, null=True, description="JSON列表路径")
ip_path = fields.CharField(max_length=255, null=True, description="IP字段路径")
port_path = fields.CharField(max_length=255, null=True, description="端口字段路径")
username_path = fields.CharField(max_length=255, null=True, description="用户名字段路径")
password_path = fields.CharField(max_length=255, null=True, description="密码字段路径")
# Regular expression for text parsing (presumably used when mode is "text" - verify)
pattern = fields.TextField(null=True, description="文本解析正则")
# Template used to build the final proxy string
template = fields.CharField(max_length=255, description="最终代理模板")
class Meta:
table = "proxy_provider"

19
app/models/enums.py Normal file
View File

@ -0,0 +1,19 @@
from enum import Enum, StrEnum
class EnumBase(Enum):
    """Enum base class with helpers that list member values and member names."""

    @classmethod
    def get_member_values(cls):
        """Return the value of every entry in ``_member_map_``, in definition order."""
        return list(member.value for member in cls._member_map_.values())

    @classmethod
    def get_member_names(cls):
        """Return a new list with the names held in ``_member_names_``."""
        return list(cls._member_names_)
# HTTP request methods as a string enum; each member's value equals its name.
class MethodType(StrEnum):
GET = "GET"
POST = "POST"
PUT = "PUT"
DELETE = "DELETE"
PATCH = "PATCH"

31
app/models/keyword.py Normal file
View File

@ -0,0 +1,31 @@
from tortoise import fields
from tortoise.models import Model
# Abstract base for the per-platform keyword tables: a (city, job) pair plus
# bookkeeping about when it was last requested.
class BaseKeyword(Model):
id = fields.IntField(pk=True)
city = fields.CharField(max_length=64)
job = fields.CharField(max_length=128)
# Last request recorded both as a date and as a full timestamp; both nullable
last_requested_date = fields.DateField(null=True)
last_requested_at = fields.DatetimeField(null=True)
created_at = fields.DatetimeField(auto_now_add=True)
updated_at = fields.DatetimeField(auto_now=True)
class Meta:
abstract = True
# Concrete keyword tables, one per platform; each only sets its table name.
class BossKeyword(BaseKeyword):
class Meta:
table = "boss_keyword"
class QcwyKeyword(BaseKeyword):
class Meta:
table = "qcwy_keyword"
class ZhilianKeyword(BaseKeyword):
class Meta:
table = "zhilian_keyword"

35
app/models/metrics.py Normal file
View File

@ -0,0 +1,35 @@
from tortoise import fields
from tortoise.models import Model
# A row count for one (source, table_type) pair, captured at timestamp ts.
class StatsTotal(Model):
id = fields.IntField(pk=True)
source = fields.CharField(max_length=32)
table_type = fields.CharField(max_length=32)
count = fields.IntField()
ts = fields.DatetimeField()
# Upload statistics per source and IP for one calendar date; unique on (source, ip, date).
class IpUploadStats(Model):
id = fields.IntField(pk=True)
source = fields.CharField(max_length=32)
ip = fields.CharField(max_length=64)
date = fields.DateField()
upload_count = fields.IntField()
# Nullable: may be empty when no report time has been stored
last_report_at = fields.DatetimeField(null=True)
# Defaults to "normal"
status = fields.CharField(max_length=16, default="normal")
created_at = fields.DatetimeField(auto_now_add=True)
class Meta:
unique_together = ("source", "ip", "date")
# Record of one scheduled task execution: identifiers, status, timing and an optional error text.
# NOTE(review): presumably written by the scheduler's run-recording helper - confirm.
class ScheduledTaskRun(Model):
id = fields.IntField(pk=True)
task_id = fields.CharField(max_length=64)
task_name = fields.CharField(max_length=64)
status = fields.CharField(max_length=32)
started_at = fields.DatetimeField()
finished_at = fields.DatetimeField()
# Milliseconds, judging by the field name
duration_ms = fields.IntField()
error = fields.TextField(null=True)

17
app/models/token.py Normal file
View File

@ -0,0 +1,17 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from tortoise import fields
from app.models.base import BaseModel, TimestampMixin
# Boss Zhipin token record stored in table "boss_token": the wt2/mpt values plus usage bookkeeping.
class BossToken(BaseModel, TimestampMixin):
wt2 = fields.CharField(max_length=200, null=True, description="Boss直聘wt2")
mpt = fields.CharField(max_length=200, null=True, description="Boss直聘mpt")
is_active = fields.BooleanField(default=True, description="是否可用")
failed_count = fields.IntField(default=0, description="失败次数")
last_used_time = fields.DatetimeField(null=True, description="最后使用时间")
class Meta:
table = "boss_token"
table_description = "Boss直聘token表"

View File

@ -0,0 +1,173 @@
import math
from collections.abc import Generator
from datetime import datetime
from typing import Optional, Dict, Any, List
from clickhouse_connect.driver import AsyncClient
from clickhouse_connect.driver.query import QueryResult
class ClickHouseBaseRepo:
    """Base repository bound to one ClickHouse table through an async client."""

    def __init__(self, clickhouse_client: AsyncClient, table_name: str):
        self._table_name = table_name
        self._clickhouse_client = clickhouse_client

    async def execute_query(self, query: str, parameters: Optional[Dict[str, Any]] = None) -> QueryResult:
        """Run ``query`` with the optional bound ``parameters`` and return the client's result."""
        result = await self._clickhouse_client.query(query, parameters=parameters)
        return result

    async def execute_insert(self, data: List[Dict[str, Any]]) -> None:
        """Insert ``data`` into the repository table in one batch.

        Column names come from the first row; every row must contain those keys.
        An empty list is a no-op.
        """
        if not data:
            return
        column_names = list(data[0].keys())
        matrix = []
        for record in data:
            matrix.append([record[name] for name in column_names])
        await self._clickhouse_client.insert(
            table=self._table_name,
            data=matrix,
            column_names=column_names,
        )

    def _build_where_statements(
        self,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None
    ) -> tuple[List[str], Dict[str, Any]]:
        """Build WHERE fragments and their bound parameters.

        Each filter whose value is not None becomes ``<key> = %(<key>)s``;
        ``from_dt`` / ``to_dt`` add inclusive bounds on ``created_at``.
        Returns ``(fragments, params)``.
        """
        fragments: List[str] = []
        bound: Dict[str, Any] = {}
        for column, expected in (filters or {}).items():
            if expected is None:
                continue
            fragments.append(f"{column} = %({column})s")
            bound[column] = expected
        if from_dt:
            fragments.append("created_at >= %(from_dt)s")
            bound['from_dt'] = from_dt
        if to_dt:
            fragments.append("created_at <= %(to_dt)s")
            bound['to_dt'] = to_dt
        return fragments, bound
class JobAnalyticsRepo(ClickHouseBaseRepo):
    """Analytics queries over the "job_analytics" table."""

    # Public interval name -> ClickHouse bucketing function; unknown names fall back to days.
    _BUCKET_FUNCTIONS = {
        "day": "toStartOfDay",
        "hour": "toStartOfHour",
        "week": "toStartOfWeek",
        "month": "toStartOfMonth",
    }

    def __init__(self, clickhouse_client: AsyncClient):
        super().__init__(clickhouse_client, "job_analytics")

    def _where(self, filters, from_dt, to_dt):
        """Return ``(where_clause, params)``; the clause is "1=1" when nothing is filtered."""
        fragments, params = self._build_where_statements(
            filters=filters, from_dt=from_dt, to_dt=to_dt
        )
        clause = " AND ".join(fragments) if fragments else "1=1"
        return clause, params

    async def get_job_count(
        self,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> int:
        """Count the rows that match the filters and the optional time range."""
        where_clause, params = self._where(filters, from_dt, to_dt)
        query = (
            "\n"
            "SELECT COUNT(*)\n"
            f"FROM {self._table_name}\n"
            f"WHERE {where_clause}\n"
        )
        outcome = await self.execute_query(query, parameters=params)
        return int(outcome.result_rows[0][0])

    async def group_jobs_by_column(
        self,
        group_by_column: str,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
        limit: int = 10
    ) -> List[Dict[str, Any]]:
        """Count rows grouped by ``group_by_column``, largest groups first, at most ``limit`` groups.

        ``group_by_column`` and ``limit`` are placed into the SQL text as-is.
        """
        where_clause, params = self._where(filters, from_dt, to_dt)
        query = (
            "\n"
            "SELECT\n"
            f"{group_by_column} AS category,\n"
            "COUNT(*) AS job_count\n"
            f"FROM {self._table_name}\n"
            f"WHERE {where_clause}\n"
            f"GROUP BY {group_by_column}\n"
            "ORDER BY job_count DESC\n"
            f"LIMIT {limit}\n"
        )
        outcome = await self.execute_query(query, parameters=params)
        grouped = []
        for category, job_count in ((row[0], row[1]) for row in outcome.result_rows):
            grouped.append({"category": category, "job_count": int(job_count)})
        return grouped

    async def get_volume_trend(
        self,
        interval: str = "day",  # one of: day, hour, week, month
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> List[Dict[str, Any]]:
        """Return row counts per time bucket and source, ordered by bucket ascending."""
        where_clause, params = self._where(filters, from_dt, to_dt)
        time_func = self._BUCKET_FUNCTIONS.get(interval, "toStartOfDay")
        # toTimeZone makes the buckets follow Beijing time (Asia/Shanghai).
        query = (
            "\n"
            "SELECT\n"
            f"{time_func}(toTimeZone(created_at, 'Asia/Shanghai')) AS time_bucket,\n"
            "source,\n"
            "COUNT(*) AS count\n"
            f"FROM {self._table_name}\n"
            f"WHERE {where_clause}\n"
            "GROUP BY time_bucket, source\n"
            "ORDER BY time_bucket ASC\n"
        )
        outcome = await self.execute_query(query, parameters=params)
        trend = []
        for row in outcome.result_rows:
            trend.append({
                "time": row[0].isoformat(),
                "source": row[1],
                "count": int(row[2]),
            })
        return trend

    async def get_source_distribution(
        self,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> List[Dict[str, Any]]:
        """Return row counts grouped by the ``source`` column."""
        distribution = await self.group_jobs_by_column(
            "source", filters=filters, from_dt=from_dt, to_dt=to_dt
        )
        return distribution

1
app/schemas/__init__.py Normal file
View File

@ -0,0 +1 @@
from .base import *

76
app/schemas/analytics.py Normal file
View File

@ -0,0 +1,76 @@
from datetime import datetime
from typing import Optional, Dict, Any, List
from pydantic import BaseModel, Field
class AnalyticsQueryParams(BaseModel):
    """Analytics query parameters."""

    # Optional time window.
    from_date: Optional[datetime] = Field(default=None, description="开始日期")
    to_date: Optional[datetime] = Field(default=None, description="结束日期")
    # Optional filters.
    city: Optional[str] = Field(default=None, description="城市筛选")
    company_name: Optional[str] = Field(default=None, description="公司名称筛选")
    position_name: Optional[str] = Field(default=None, description="职位名称筛选")
    industry: Optional[str] = Field(default=None, description="行业筛选")
    experience_required: Optional[str] = Field(default=None, description="经验要求筛选")
    # Result-size cap, constrained to 1..100.
    limit: int = Field(default=10, ge=1, le=100, description="返回结果数量限制")
class SalaryStatistics(BaseModel):
    """Salary statistics."""
    # Every field is required (no defaults are declared).
    avg_salary_min: float = Field(description="最低薪资平均值")  # mean of the minimum salaries
    avg_salary_max: float = Field(description="最高薪资平均值")  # mean of the maximum salaries
    min_salary: float = Field(description="最低薪资")  # lowest salary
    max_salary: float = Field(description="最高薪资")  # highest salary
    total_jobs: int = Field(description="有薪资信息的职位总数")  # jobs that carry salary information
class JobStatisticsResponse(BaseModel):
    """Job statistics response."""
    total_jobs: int = Field(description="职位总数")  # total number of jobs
    # Statistics period: string keys mapped to optional string values.
    period: Dict[str, Optional[str]] = Field(description="统计时间段")
class CategoryCount(BaseModel):
    """Per-category count."""
    category: str = Field(description="分类名称")  # category name
    job_count: int = Field(description="职位数量")  # number of jobs in the category
class TopCompaniesResponse(BaseModel):
    """Top companies response."""
    companies: List[CategoryCount] = Field(description="公司列表")  # company list
    total_count: int = Field(description="总数")  # total
class TopPositionsResponse(BaseModel):
    """Top positions response."""
    positions: List[CategoryCount] = Field(description="职位列表")  # position list
    total_count: int = Field(description="总数")  # total
class CityDistributionResponse(BaseModel):
    """City distribution response."""
    cities: List[CategoryCount] = Field(description="城市列表")  # city list
    total_count: int = Field(description="总数")  # total
class IndustryDistributionResponse(BaseModel):
    """Industry distribution response."""
    industries: List[CategoryCount] = Field(description="行业列表")  # industry list
    total_count: int = Field(description="总数")  # total
class ExperienceDistributionResponse(BaseModel):
    """Experience-requirement distribution response."""
    experience_levels: List[CategoryCount] = Field(description="经验要求列表")  # experience-requirement list
    total_count: int = Field(description="总数")  # total
class AnalyticsDashboardResponse(BaseModel):
    """Analytics dashboard response."""
    # One job-statistics summary plus five category breakdowns; all required.
    job_statistics: JobStatisticsResponse = Field(description="职位统计")  # job statistics
    top_companies: List[CategoryCount] = Field(description="热门公司")  # top companies
    top_positions: List[CategoryCount] = Field(description="热门职位")  # top positions
    city_distribution: List[CategoryCount] = Field(description="城市分布")  # city distribution
    industry_distribution: List[CategoryCount] = Field(description="行业分布")  # industry distribution
    experience_distribution: List[CategoryCount] = Field(description="经验要求分布")  # experience distribution

17
app/schemas/apis.py Normal file
View File

@ -0,0 +1,17 @@
from pydantic import BaseModel, Field
from app.models.enums import MethodType
class BaseApi(BaseModel):
    # Shared fields describing one API endpoint record.
    # Schema examples are passed through json_schema_extra: pydantic v2
    # deprecates arbitrary extra keyword arguments (such as example=) on Field.
    path: str = Field(..., description="API路径", json_schema_extra={"example": "/api/v1/user/list"})
    summary: str = Field("", description="API简介", json_schema_extra={"example": "查看用户列表"})
    method: MethodType = Field(..., description="API方法", json_schema_extra={"example": "GET"})
    tags: str = Field(..., description="API标签", json_schema_extra={"example": "User"})
# Creation payload: same fields as BaseApi, nothing added.
class ApiCreate(BaseApi): ...
class ApiUpdate(BaseApi):
    # Update payload: BaseApi fields plus the required id of the record.
    id: int

56
app/schemas/base.py Normal file
View File

@ -0,0 +1,56 @@
from typing import Any, Optional
from fastapi.encoders import jsonable_encoder
from fastapi.responses import JSONResponse
class Success(JSONResponse):
    """JSON response with a ``code`` / ``msg`` / ``data`` envelope (defaults: 200, "OK").

    ``code`` is also used as the HTTP status code of the response.
    """

    def __init__(
        self,
        code: int = 200,
        msg: Optional[str] = "OK",
        data: Optional[Any] = None,
        **kwargs,
    ):
        # Extra keyword arguments are merged last, so they may add or override keys.
        payload = {"code": code, "msg": msg, "data": data, **kwargs}
        super().__init__(content=jsonable_encoder(payload), status_code=code)
class Fail(JSONResponse):
    """JSON response with a ``code`` / ``msg`` / ``data`` envelope (defaults: 400, no message).

    ``code`` is also used as the HTTP status code of the response.
    """

    def __init__(
        self,
        code: int = 400,
        msg: Optional[str] = None,
        data: Optional[Any] = None,
        **kwargs,
    ):
        # Extra keyword arguments are merged last, so they may add or override keys.
        payload = {"code": code, "msg": msg, "data": data, **kwargs}
        super().__init__(content=jsonable_encoder(payload), status_code=code)
class SuccessExtra(JSONResponse):
    """JSON response envelope that additionally carries ``total`` / ``page`` / ``page_size``.

    ``code`` is also used as the HTTP status code of the response.
    """

    def __init__(
        self,
        code: int = 200,
        msg: Optional[str] = None,
        data: Optional[Any] = None,
        total: int = 0,
        page: int = 1,
        page_size: int = 20,
        **kwargs,
    ):
        envelope = dict(code=code, msg=msg, data=data)
        paging = dict(total=total, page=page, page_size=page_size)
        # Extra keyword arguments are merged last, so they may add or override keys.
        payload = {**envelope, **paging, **kwargs}
        super().__init__(content=jsonable_encoder(payload), status_code=code)

29
app/schemas/cleaning.py Normal file
View File

@ -0,0 +1,29 @@
from datetime import datetime
from typing import Optional, Any
from pydantic import BaseModel, Field
class CleaningTaskBase(BaseModel):
    # Fields shared by every cleaning-task schema; only `target` is required.
    target: str = Field(description="目标(URL/公司名/ID)")
    clean_type: str = Field(default="auto", description="清洗模式")
    platform: str = Field(default="auto", description="目标平台")
    proxy: Optional[str] = Field(default=None, description="代理地址")
    # Processing outcome.
    status: str = Field(default="pending", description="状态: pending/processing/success/fail")
    storage_status: str = Field(default="unknown", description="存储状态: saved/duplicate/failed/unknown")
    remote_sent: bool = Field(default=False, description="是否已远程推送")
    # Free-form payloads and error text.
    result_summary: Optional[Any] = Field(default=None, description="清洗结果摘要")
    original_data: Optional[Any] = Field(default=None, description="原始请求数据")
    error_msg: Optional[str] = Field(default=None, description="错误信息")
class CleaningTaskCreate(CleaningTaskBase):
    # Creation payload: same fields as CleaningTaskBase, nothing added.
    pass
class CleaningTaskUpdate(CleaningTaskBase):
    # Update payload: same fields as CleaningTaskBase, nothing added.
    pass
class CleaningTaskOut(CleaningTaskBase):
    # Output schema: base fields plus the id and timestamps.
    id: int
    created_at: datetime
    updated_at: datetime
    class Config:
        # Allow building the model from objects by reading their attributes.
        from_attributes = True

18
app/schemas/depts.py Normal file
View File

@ -0,0 +1,18 @@
from pydantic import BaseModel, Field
class BaseDept(BaseModel):
    # Shared department fields; only `name` is required.
    # Schema examples are passed through json_schema_extra: pydantic v2
    # deprecates arbitrary extra keyword arguments (such as example=) on Field.
    name: str = Field(..., description="部门名称", json_schema_extra={"example": "研发中心"})
    desc: str = Field("", description="备注", json_schema_extra={"example": "研发中心"})
    order: int = Field(0, description="排序")
    parent_id: int = Field(0, description="父部门ID")
# Creation payload: same fields as BaseDept, nothing added.
class DeptCreate(BaseDept): ...
class DeptUpdate(BaseDept):
    # Update payload: BaseDept fields plus the required id of the department.
    id: int

    def update_dict(self):
        """Dump only the fields that were explicitly set, leaving out ``id``."""
        omitted = {"id"}
        return self.model_dump(exclude_unset=True, exclude=omitted)

29
app/schemas/keyword.py Normal file
View File

@ -0,0 +1,29 @@
from datetime import date, datetime
from typing import Optional
from pydantic import BaseModel, Field
class KeywordBase(BaseModel):
    # A city / job-keyword pair; both fields are required and length-limited.
    city: str = Field(max_length=64, description="城市")
    job: str = Field(max_length=128, description="职位关键词")
class KeywordCreate(KeywordBase):
    # Creation payload: same fields as KeywordBase, nothing added.
    pass
class KeywordUpdate(BaseModel):
    # Update payload: both fields are optional and default to None.
    city: Optional[str] = Field(default=None, max_length=64, description="城市")
    job: Optional[str] = Field(default=None, max_length=128, description="职位关键词")
class KeywordOut(KeywordBase):
    # Output schema: base fields plus id, last-request markers and timestamps.
    id: int
    last_requested_date: Optional[date] = None
    last_requested_at: Optional[datetime] = None
    created_at: datetime
    updated_at: datetime
    class Config:
        # Allow building the model from objects by reading their attributes.
        from_attributes = True

20
app/schemas/login.py Normal file
View File

@ -0,0 +1,20 @@
from datetime import datetime
from pydantic import BaseModel, Field
class CredentialsSchema(BaseModel):
    # Username / password pair; both fields are required.
    # Schema examples are passed through json_schema_extra: pydantic v2
    # deprecates arbitrary extra keyword arguments (such as example=) on Field.
    username: str = Field(..., description="用户名称", json_schema_extra={"example": "admin"})
    password: str = Field(..., description="密码", json_schema_extra={"example": "123456"})
class JWTOut(BaseModel):
    # Access token together with the username; both required.
    access_token: str
    username: str
class JWTPayload(BaseModel):
    # JWT payload fields; all required.
    user_id: int
    username: str
    is_superuser: bool
    exp: datetime  # NOTE(review): presumably the token expiry time - confirm against the token issuer

52
app/schemas/menus.py Normal file
View File

@ -0,0 +1,52 @@
from enum import StrEnum
from typing import Optional
from pydantic import BaseModel, Field
class MenuType(StrEnum):
    # Kind of menu entry.
    CATALOG = "catalog" # catalog (directory) node
    MENU = "menu" # menu item
class BaseMenu(BaseModel):
    # Full menu record. No field declares a default; the Optional ones accept None.
    id: int
    name: str
    path: str
    remark: Optional[dict]
    menu_type: Optional[MenuType]
    icon: Optional[str]
    order: int
    parent_id: int
    is_hidden: bool
    component: str
    keepalive: bool
    redirect: Optional[str]
    # Nested menus of the same type (self-referencing forward annotation).
    children: Optional[list["BaseMenu"]]
class MenuCreate(BaseModel):
    # Payload for creating a menu entry.
    # Schema examples are passed through json_schema_extra: pydantic v2
    # deprecates arbitrary extra keyword arguments (such as example=) on Field.
    menu_type: MenuType = Field(default=MenuType.CATALOG.value)
    name: str = Field(json_schema_extra={"example": "用户管理"})
    icon: Optional[str] = "ph:user-list-bold"
    path: str = Field(json_schema_extra={"example": "/system/user"})
    order: Optional[int] = Field(json_schema_extra={"example": 1})
    parent_id: Optional[int] = Field(default=0, json_schema_extra={"example": 0})
    is_hidden: Optional[bool] = False
    component: str = Field(default="Layout", json_schema_extra={"example": "/system/user"})
    keepalive: Optional[bool] = True
    redirect: Optional[str] = ""
class MenuUpdate(BaseModel):
    # Payload for updating the menu entry identified by `id`.
    # Schema examples are passed through json_schema_extra: pydantic v2
    # deprecates arbitrary extra keyword arguments (such as example=) on Field.
    id: int
    menu_type: Optional[MenuType] = Field(json_schema_extra={"example": MenuType.CATALOG.value})
    name: Optional[str] = Field(json_schema_extra={"example": "用户管理"})
    icon: Optional[str] = "ph:user-list-bold"
    path: Optional[str] = Field(json_schema_extra={"example": "/system/user"})
    order: Optional[int] = Field(json_schema_extra={"example": 1})
    parent_id: Optional[int] = Field(json_schema_extra={"example": 0})
    is_hidden: Optional[bool] = False
    component: str = Field(json_schema_extra={"example": "/system/user"})
    keepalive: Optional[bool] = False
    redirect: Optional[str] = ""

33
app/schemas/proxy.py Normal file
View File

@ -0,0 +1,33 @@
from datetime import datetime
from typing import Optional
from pydantic import BaseModel, Field
class ProxyConfigBase(BaseModel):
    # Shared proxy-configuration fields; name, proxy_type and proxy_url are required.
    name: str = Field(description="名称")
    proxy_type: str = Field(description="代理类型: http/socks/tunnel")
    platform: str = Field(default="all", description="目标平台: boss/qcwy/zhilian/all")
    proxy_url: str = Field(description="代理地址")
    is_active: bool = Field(default=True, description="是否可用")
class ProxyConfigCreate(ProxyConfigBase):
    # Creation payload: same fields as ProxyConfigBase, nothing added.
    pass
class ProxyConfigUpdate(BaseModel):
    # Update payload: `id` is required, every other field defaults to None.
    id: int = Field(description="主键ID")
    name: Optional[str] = Field(default=None, description="名称")
    proxy_type: Optional[str] = Field(default=None, description="代理类型: http/socks/tunnel")
    platform: Optional[str] = Field(default=None, description="目标平台: boss/qcwy/zhilian/all")
    proxy_url: Optional[str] = Field(default=None, description="代理地址")
    is_active: Optional[bool] = Field(default=None, description="是否可用")
class ProxyConfigOut(ProxyConfigBase):
    # Output schema: base fields plus the id and timestamps.
    id: int
    created_at: datetime
    updated_at: datetime
    class Config:
        # Allow building the model from objects by reading their attributes.
        from_attributes = True

View File

@ -0,0 +1,45 @@
from datetime import datetime
from typing import Optional
from pydantic import BaseModel, Field
class ProxyProviderBase(BaseModel):
    # Shared proxy-provider fields; `name` and `template` are required.
    name: str = Field(description="名称")
    platform: str = Field(default="all", description="目标平台: boss/qcwy/zhilian/all")
    mode: str = Field(default="json", description="解析模式: json/text")
    # Path settings (described as JSON list / field paths).
    list_path: Optional[str] = Field(default=None, description="JSON列表路径")
    ip_path: Optional[str] = Field(default=None, description="IP字段路径")
    port_path: Optional[str] = Field(default=None, description="端口字段路径")
    username_path: Optional[str] = Field(default=None, description="用户名字段路径")
    password_path: Optional[str] = Field(default=None, description="密码字段路径")
    # Regular expression described as the text-parsing pattern.
    pattern: Optional[str] = Field(default=None, description="文本解析正则")
    template: str = Field(description="最终代理模板")
class ProxyProviderCreate(ProxyProviderBase):
    # Creation payload: same fields as ProxyProviderBase, nothing added.
    pass
class ProxyProviderUpdate(BaseModel):
    # Update payload: `id` is required, every other field defaults to None.
    id: int = Field(description="主键ID")
    name: Optional[str] = Field(default=None, description="名称")
    platform: Optional[str] = Field(default=None, description="目标平台: boss/qcwy/zhilian/all")
    mode: Optional[str] = Field(default=None, description="解析模式: json/text")
    list_path: Optional[str] = Field(default=None, description="JSON列表路径")
    ip_path: Optional[str] = Field(default=None, description="IP字段路径")
    port_path: Optional[str] = Field(default=None, description="端口字段路径")
    username_path: Optional[str] = Field(default=None, description="用户名字段路径")
    password_path: Optional[str] = Field(default=None, description="密码字段路径")
    pattern: Optional[str] = Field(default=None, description="文本解析正则")
    template: Optional[str] = Field(default=None, description="最终代理模板")
class ProxyProviderOut(ProxyProviderBase):
    # Output schema: base fields plus the id and timestamps.
    id: int
    created_at: datetime
    updated_at: datetime
    class Config:
        # Allow building the model from objects by reading their attributes.
        from_attributes = True

32
app/schemas/roles.py Normal file
View File

@ -0,0 +1,32 @@
from datetime import datetime
from typing import Optional
from pydantic import BaseModel, Field
class BaseRole(BaseModel):
    # Role record with its related users, menus and apis.
    id: int
    name: str
    desc: str = ""
    # Related collections; each defaults to an empty list.
    users: Optional[list] = []
    menus: Optional[list] = []
    apis: Optional[list] = []
    # Declared without defaults; None is accepted as a value.
    created_at: Optional[datetime]
    updated_at: Optional[datetime]
class RoleCreate(BaseModel):
    # Payload for creating a role; `name` is required.
    # Schema examples are passed through json_schema_extra: pydantic v2
    # deprecates arbitrary extra keyword arguments (such as example=) on Field.
    name: str = Field(json_schema_extra={"example": "管理员"})
    desc: str = Field("", json_schema_extra={"example": "管理员角色"})
class RoleUpdate(BaseModel):
    # Payload for updating a role; `id` and `name` are required.
    # Schema examples are passed through json_schema_extra: pydantic v2
    # deprecates arbitrary extra keyword arguments (such as example=) on Field.
    id: int = Field(json_schema_extra={"example": 1})
    name: str = Field(json_schema_extra={"example": "管理员"})
    desc: str = Field("", json_schema_extra={"example": "管理员角色"})
class RoleUpdateMenusApis(BaseModel):
    # Menu ids and api descriptors submitted for the role identified by `id`.
    id: int
    menu_ids: list[int] = []
    api_infos: list[dict] = []  # NOTE(review): dict keys are not visible here - confirm against the consumer

36
app/schemas/token.py Normal file
View File

@ -0,0 +1,36 @@
from pydantic import BaseModel, Field
from datetime import datetime
from typing import Optional, List, Dict, Any
class BossTokenCreate(BaseModel):
    # Payload for creating a Boss token record; wt2 and mpt are required.
    wt2: str = Field(description="Boss直聘wt2")
    mpt: str = Field(description="Boss直聘mpt")
    is_active: bool = Field(default=True, description="是否可用")
class BossTokenUpdate(BaseModel):
    # Update payload: every field is optional and defaults to None.
    wt2: Optional[str] = Field(default=None, description="Boss直聘wt2")
    mpt: Optional[str] = Field(default=None, description="Boss直聘mpt")
    is_active: Optional[bool] = Field(default=None, description="是否可用")
    failed_count: Optional[int] = Field(default=None, description="失败次数")
    last_used_time: Optional[datetime] = Field(default=None, description="最后使用时间")
class BossTokenResponse(BaseModel):
    # Output schema for one Boss token record. No field declares a default;
    # the Optional ones accept None.
    id: int
    wt2: Optional[str]
    mpt: Optional[str]
    is_active: bool
    failed_count: int
    last_used_time: Optional[datetime]
    created_at: datetime
    updated_at: datetime
    class Config:
        # Allow building the model from objects by reading their attributes.
        from_attributes = True
class BossTokenList(BaseModel):
    # A list of token records together with a total count.
    total: int
    items: List[BossTokenResponse]

44
app/schemas/users.py Normal file
View File

@ -0,0 +1,44 @@
from datetime import datetime
from typing import List, Optional
from pydantic import BaseModel, EmailStr, Field
class BaseUser(BaseModel):
    # User record.
    id: int
    email: Optional[EmailStr] = None
    username: Optional[str] = None
    is_active: Optional[bool] = True
    is_superuser: Optional[bool] = False
    # Declared without defaults; None is accepted as a value.
    created_at: Optional[datetime]
    updated_at: Optional[datetime]
    last_login: Optional[datetime]
    roles: Optional[list] = []
class UserCreate(BaseModel):
    # Payload for creating a user; email, username and password are required.
    # Schema examples are passed through json_schema_extra: pydantic v2
    # deprecates arbitrary extra keyword arguments (such as example=) on Field.
    email: EmailStr = Field(json_schema_extra={"example": "admin@qq.com"})
    username: str = Field(json_schema_extra={"example": "admin"})
    password: str = Field(json_schema_extra={"example": "123456"})
    is_active: Optional[bool] = True
    is_superuser: Optional[bool] = False
    role_ids: Optional[List[int]] = []
    dept_id: Optional[int] = Field(0, description="部门ID")

    def create_dict(self):
        """Dump only the fields that were explicitly set, leaving out ``role_ids``."""
        return self.model_dump(exclude_unset=True, exclude={"role_ids"})
class UserUpdate(BaseModel):
    # Payload for updating a user; id, email and username are required.
    id: int
    email: EmailStr
    username: str
    is_active: Optional[bool] = True
    is_superuser: Optional[bool] = False
    role_ids: Optional[List[int]] = []
    dept_id: Optional[int] = 0
class UpdatePassword(BaseModel):
    # Old / new password pair; both required.
    old_password: str = Field(description="旧密码")  # old password
    new_password: str = Field(description="新密码")  # new password

View File

@ -0,0 +1,58 @@
from datetime import datetime
from typing import Optional, Dict, Any, List
from clickhouse_connect.driver import AsyncClient
from app.repositories.clickhouse_repo import JobAnalyticsRepo
class AnalyticsService:
    """Data analytics service; every query is delegated to ``JobAnalyticsRepo``."""

    def __init__(self, clickhouse_client: AsyncClient):
        self.job_repo = JobAnalyticsRepo(clickhouse_client)

    async def get_job_statistics(
        self,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> Dict[str, Any]:
        """Return the total job count plus the requested period (totals only).

        The period bounds are ISO-formatted strings, or ``None`` when the
        corresponding bound was not supplied.
        """
        job_total = await self.job_repo.get_job_count(
            filters=filters, from_dt=from_dt, to_dt=to_dt
        )
        period = {
            "from_date": None if not from_dt else from_dt.isoformat(),
            "to_date": None if not to_dt else to_dt.isoformat(),
        }
        return {"total_jobs": job_total, "period": period}

    async def get_volume_trend(
        self,
        interval: str = "day",
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> List[Dict[str, Any]]:
        """Return the data-volume trend as produced by the repository."""
        query_args = dict(interval=interval, filters=filters, from_dt=from_dt, to_dt=to_dt)
        return await self.job_repo.get_volume_trend(**query_args)

    async def get_source_distribution(
        self,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> List[Dict[str, Any]]:
        """Return the data-source distribution as produced by the repository."""
        query_args = dict(filters=filters, from_dt=from_dt, to_dt=to_dt)
        return await self.job_repo.get_source_distribution(**query_args)

363
app/services/cleaning.py Normal file
View File

@ -0,0 +1,363 @@
import csv
import io
import re
from typing import List, Dict, Any, Union, Optional
from fastapi import UploadFile
from loguru import logger
from app.services.crawler.boss import BossService
from app.services.crawler.qcwy import QcwyService
from app.services.crawler.zhilian import ZhilianService
from app.services.job import DataRouterService, DataType, PlatformType
from app.core.clickhouse import clickhouse_manager
from app.models.token import BossToken
from jobs_spider.qcwy.search_company_jobs import _extract_items as qcwy_extract_items
class CleaningService:
def __init__(self):
    """Create one crawler client per platform and reset the lazily-filled state."""
    # One crawler client per supported platform.
    self.boss_service = BossService()
    self.qcwy_service = QcwyService()
    self.zhilian_service = ZhilianService()
    # Lazily-filled state: the router is created by get_data_router(), the
    # flag is set by _ensure_boss_token_loaded().
    self.data_router, self._boss_token_loaded = None, False
def _apply_proxy(self, proxy: Optional[str]) -> None:
self.boss_service.set_proxy(proxy)
self.qcwy_service.set_proxy(proxy)
self.zhilian_service.set_proxy(proxy)
async def _ensure_boss_token_loaded(self) -> None:
if self._boss_token_loaded and self.boss_service.login_data.get("mpt"):
return
token_obj = await BossToken.filter(is_active=True).order_by("-updated_at").first()
if not token_obj:
logger.warning("BossToken not found or inactive")
return
self.boss_service.set_login_data(token_obj.mpt or "", "")
self._boss_token_loaded = True
async def get_data_router(self) -> DataRouterService:
    """Return the cached ``DataRouterService``, creating it on first use."""
    if self.data_router:
        return self.data_router
    ch_client = await clickhouse_manager.get_client()
    self.data_router = DataRouterService(ch_client)
    return self.data_router
async def parse_file(self, file: UploadFile) -> List[str]:
    """Extract the list of targets from an uploaded file.

    A file whose name ends in ``.csv`` (case-insensitive) is parsed as CSV
    and the first column of every non-empty row is used; any other upload
    is treated as plain text with one target per line.

    Args:
        file: Uploaded file; its content must be UTF-8 (a BOM is tolerated).

    Returns:
        The targets, stripped of surrounding whitespace, with blanks removed.

    Raises:
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    raw = await file.read()
    # "utf-8-sig" decodes plain UTF-8 and drops a leading BOM when present,
    # so the first target stays clean for CSV and plain-text uploads alike.
    text = raw.decode("utf-8-sig")
    # The filename may be missing; such uploads are read as plain text.
    filename = (file.filename or "").lower()
    if filename.endswith(".csv"):
        targets = [row[0].strip() for row in csv.reader(io.StringIO(text)) if row]
    else:
        targets = [line.strip() for line in text.splitlines()]
    return [t for t in targets if t]
async def process_single_item(self, target: str, clean_type: str = "auto", platform: str = "auto", proxy: Optional[str] = None) -> Dict[str, Any]:
    """Clean one target and report the outcome as a uniform status dict.

    Args:
        target: URL, company name or ID to process.
        clean_type: ``auto``, ``clean_url``, ``job_id``, ``company_name``,
            ``company_id`` or ``company_jobs``.
        platform: ``auto``, ``boss``, ``qcwy`` or ``zhilian``.
        proxy: Proxy handed to every crawler service before the run.

    Returns:
        A dict with ``success``, ``target``, ``error``, ``storage_status``
        and ``remote_sent``. Dict results additionally carry
        ``data_summary`` and ``original_data``. ``storage_status`` is
        ``duplicate``, ``saved`` or ``failed`` for dict results,
        ``unknown`` for a bare ``True`` result, and ``error`` when an
        exception was caught (exceptions are logged, never propagated).
    """
    try:
        await self._ensure_boss_token_loaded()
        self._apply_proxy(proxy)

        result = None
        if clean_type == "auto":
            result = await self.clean_target_auto(target)
        elif clean_type == "clean_url":
            if platform == "auto":
                result = await self.clean_target_auto(target)
            elif platform == "boss":
                result = await self._process_boss_url(target)
            elif platform == "qcwy":
                result = await self._process_qcwy_url(target)
            elif platform == "zhilian":
                result = await self._process_zhilian_url(target)
        elif clean_type == "job_id":
            result = await self.clean_by_job_id(target, platform)
        elif clean_type == "company_name":
            result = await self.clean_by_company_name(target, platform)
        elif clean_type == "company_id":
            result = await self.clean_by_company_id(target, platform)
        elif clean_type == "company_jobs":
            if platform == "boss":
                result = await self.clean_boss_company_jobs(target)
            elif platform == "qcwy":
                result = await self.clean_qcwy_company_jobs(target)
            elif platform == "zhilian":
                result = await self.clean_zhilian_company_jobs(target)

        # Covers None, False and empty results (unsupported combinations
        # of clean_type / platform end up here as well).
        if not result:
            return {
                "success": False,
                "target": target,
                "error": "No data found or operation failed",
                "storage_status": "failed",
                "remote_sent": False
            }

        # A bare boolean carries no storage details, so wrap it.
        if isinstance(result, bool):
            return {
                "success": result,
                "target": target,
                "error": None if result else "Operation failed",
                "storage_status": "unknown",
                "remote_sent": False
            }

        # Otherwise it is a result dict with success / duplicate / message keys.
        succeeded = result.get("success", False)
        if result.get("duplicate"):
            storage_status = "duplicate"
        elif succeeded:
            storage_status = "saved"
        else:
            # A store that reported failure must not be shown as saved.
            storage_status = "failed"
        return {
            "success": succeeded,
            "target": target,
            "error": result.get("message") if not succeeded else None,
            "storage_status": storage_status,
            "remote_sent": result.get("remote_sent", False),
            "data_summary": result.get("data_summary"),  # optional summary of the data
            "original_data": result.get("original_data")
        }
    except Exception as e:
        logger.error(f"Error processing item {target}: {e}")
        return {
            "success": False,
            "target": target,
            "error": str(e),
            "storage_status": "error",
            "remote_sent": False
        }
async def clean_target_auto(self, target: str) -> Union[bool, Dict[str, Any]]:
    """Route a target by the domain it contains; anything else is a company-name search."""
    # (domain marker, handler attribute), checked in this order.
    routes = (
        ("zhipin.com", "_process_boss_url"),
        ("51job.com", "_process_qcwy_url"),
        ("zhaopin.com", "_process_zhilian_url"),
    )
    for marker, handler_name in routes:
        if marker in target:
            return await getattr(self, handler_name)(target)
    return await self._process_search_company(target)
async def clean_by_job_id(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
    """Fetch one job by ID on the given platform and store it.

    ``target`` may also be a job URL, in which case the ID is extracted
    from it. Returns the store result (with ``original_data`` attached) or
    ``False`` when nothing was fetched or stored.
    """
    router = await self.get_data_router()
    data = None
    result = None

    if platform == "boss":
        found = re.search(r'job_detail/([^.]+)\.html', target)
        job_id = found.group(1) if found else target
        data = self.boss_service.get_job_detail_by_id(job_id)
        if data:
            result = await router.store_data(data, DataType.JOB, PlatformType.BOSS)
    elif platform == "qcwy":
        found = re.search(r'/(\d+)\.html', target)
        job_id = found.group(1) if found else target
        data = self.qcwy_service.get_job_detail(job_id)
        if data:
            result = await router.store_data(data, DataType.JOB, PlatformType.QCWY)
    elif platform == "zhilian":
        found = re.search(r'jobs\.zhaopin\.com/(\w+)\.htm', target)
        job_id = found.group(1) if found else target
        data = self.zhilian_service.get_job_detail(job_id)
        if data:
            result = await router.store_data(data, DataType.JOB, PlatformType.ZHILIAN)

    if not (result and isinstance(result, dict) and data):
        return False
    result['original_data'] = data
    return result
async def clean_by_company_name(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
    """Search jobs by company name on one platform and store every hit.

    A search can yield several jobs; only the store result of the LAST job
    is returned as the indicative status, with the full search response
    attached as ``original_data``. Returns ``False`` when nothing was found
    or stored.
    """
    router = await self.get_data_router()

    async def store_all(jobs, platform_type, raw_response):
        # Store each job in turn and keep only the final store result.
        outcome = None
        for item in jobs:
            outcome = await router.store_data(item, DataType.JOB, platform_type)
        if outcome and isinstance(outcome, dict):
            outcome['original_data'] = raw_response
        return outcome if outcome else False

    if platform == "boss":
        response = self.boss_service.search_jobs(target)
        if response and response.get('zpData') and response['zpData'].get('list'):
            return await store_all(response['zpData']['list'], PlatformType.BOSS, response)
    elif platform == "qcwy":
        response = self.qcwy_service.search_jobs(target)
        if response:
            return await store_all(response, PlatformType.QCWY, response)
    elif platform == "zhilian":
        response = self.zhilian_service.search_company_jobs_by_name(target)
        if response and isinstance(response, dict):
            payload = response.get("data") or {}
            items = payload.get("list") or []
            if not isinstance(items, list):
                items = []
            return await store_all(items, PlatformType.ZHILIAN, response)
    return False
async def clean_by_company_id(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
    """Fetch one company by ID on the given platform and store it.

    For ``qcwy`` an ID of the form ``co<digits>`` is reduced to its digits.
    Returns the store result (with ``original_data`` attached) or ``False``
    when nothing was fetched or stored.
    """
    router = await self.get_data_router()

    data = None
    if platform == "boss":
        data = self.boss_service.get_company_detail_by_id(target)
    elif platform == "qcwy":
        prefixed = re.match(r"^co(\d+)$", target)
        data = self.qcwy_service.get_company_info(prefixed.group(1) if prefixed else target)
    elif platform == "zhilian":
        data = self.zhilian_service.get_company_detail(target)

    # Unknown platforms leave data as None and end here as well.
    if not data:
        return False

    platform_type = {
        "boss": PlatformType.BOSS,
        "qcwy": PlatformType.QCWY,
        "zhilian": PlatformType.ZHILIAN,
    }[platform]
    result = await router.store_data(data, DataType.COMPANY, platform_type)
    if result and isinstance(result, dict):
        result['original_data'] = data
        return result
    return False
async def clean_boss_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
    """Fetch all jobs of a Boss company and store each of them.

    ``target`` is a company ID or a ``gongsi/<id>.html`` URL. Returns the
    store result of the last job (with the raw response attached as
    ``original_data``) or ``False`` when no jobs were found or stored.
    """
    router = await self.get_data_router()
    url_hit = re.search(r'gongsi/([^.]+)\.html', target)
    company_id = url_hit.group(1) if url_hit else target

    data = self.boss_service.get_company_jobs_by_id(company_id)
    if not data:
        return False

    # The job list sits under zpData, in "jobList" or else in "list".
    jobs = []
    zp_data = data.get("zpData") if isinstance(data, dict) else None
    if isinstance(zp_data, dict):
        for key in ("jobList", "list"):
            candidate = zp_data.get(key)
            if isinstance(candidate, list):
                jobs = candidate
                break
    if not jobs:
        return False

    outcome: Optional[Dict[str, Any]] = None
    for job in jobs:
        outcome = await router.store_data(job, DataType.JOB, PlatformType.BOSS)
    if not (outcome and isinstance(outcome, dict)):
        return False
    outcome["original_data"] = data
    return outcome
async def clean_qcwy_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
    """Fetch all jobs of a QCWY company and store each of them.

    An ID of the form ``co<digits>`` is reduced to its digits. Returns the
    store result of the last job (with the raw response attached as
    ``original_data``) or ``False`` when no jobs were found or stored.
    """
    router = await self.get_data_router()
    prefixed = re.match(r'^co(\d+)$', target)
    company_id = prefixed.group(1) if prefixed else target

    data = self.qcwy_service.get_company_jobs_by_id(company_id)
    if not data:
        return False

    extracted = qcwy_extract_items(data)
    jobs: List[Dict[str, Any]] = extracted if isinstance(extracted, list) else []
    if not jobs:
        return False

    outcome: Optional[Dict[str, Any]] = None
    for job in jobs:
        outcome = await router.store_data(job, DataType.JOB, PlatformType.QCWY)
    if not (outcome and isinstance(outcome, dict)):
        return False
    outcome["original_data"] = data
    return outcome
async def clean_zhilian_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
    """Fetch all jobs of a Zhilian company and store each of them.

    The jobs are read from ``data.list`` of the response. Returns the store
    result of the last job (with the raw response attached as
    ``original_data``) or ``False`` when no jobs were found or stored.
    """
    router = await self.get_data_router()
    data = self.zhilian_service.get_company_jobs_by_id(target)
    if not (data and isinstance(data, dict)):
        return False

    payload = data.get("data") or {}
    jobs = payload.get("list") or []
    if not (isinstance(jobs, list) and jobs):
        return False

    outcome: Optional[Dict[str, Any]] = None
    for job in jobs:
        outcome = await router.store_data(job, DataType.JOB, PlatformType.ZHILIAN)
    if not (outcome and isinstance(outcome, dict)):
        return False
    outcome["original_data"] = data
    return outcome
async def _process_boss_url(self, url: str) -> Union[bool, Dict[str, Any]]:
job_match = re.search(r'job_detail/([^.]+)\.html', url)
if job_match:
return await self.clean_by_job_id(job_match.group(1), "boss")
company_match = re.search(r'gongsi/([^.]+)\.html', url)
if company_match:
return await self.clean_by_company_id(company_match.group(1), "boss")
# Fallback: assume it's a job ID
return await self.clean_by_job_id(url, "boss")
async def _process_qcwy_url(self, url: str) -> Union[bool, Dict[str, Any]]:
job_match = re.search(r'/(\d+)\.html', url)
if job_match:
return await self.clean_by_job_id(job_match.group(1), "qcwy")
# Fallback: assume it's a job ID
return await self.clean_by_job_id(url, "qcwy")
async def _process_zhilian_url(self, url: str) -> Union[bool, Dict[str, Any]]:
job_match = re.search(r'jobs\.zhaopin\.com/(\w+)\.htm', url)
if job_match:
return await self.clean_by_job_id(job_match.group(1), "zhilian")
# Fallback: assume it's a job ID
return await self.clean_by_job_id(url, "zhilian")
async def _process_search_company(self, name: str) -> Union[bool, Dict[str, Any]]:
return await self.clean_by_company_name(name, "boss")

View File

@ -0,0 +1,645 @@
import asyncio
import json
import random
from datetime import datetime
from typing import Any, Dict, List, Optional
from loguru import logger
from app.core.clickhouse import clickhouse_manager
from app.models.token import BossToken
from app.services.crawler.boss import BossService
from app.services.crawler.qcwy import QcwyService
from app.services.crawler.zhilian import ZhilianService
class CompanyCleaner:
def __init__(self):
    """Create one crawler client per platform and mark the Boss token as not loaded."""
    # Set by _ensure_boss_token_loaded() once a token was applied.
    self._boss_token_loaded = False
    # One crawler client per supported platform.
    self.boss_service = BossService()
    self.qcwy_service = QcwyService()
    self.zhilian_service = ZhilianService()
def _apply_proxy(self, proxy: Optional[str]) -> None:
self.boss_service.set_proxy(proxy)
self.qcwy_service.set_proxy(proxy)
self.zhilian_service.set_proxy(proxy)
async def _ensure_boss_token_loaded(self) -> None:
if self._boss_token_loaded and self.boss_service.login_data.get("mpt"):
return
token_obj = await BossToken.filter(is_active=True).order_by("-updated_at").first()
if not token_obj:
logger.warning("BossToken not found or inactive in CompanyCleaner")
return
self.boss_service.set_login_data(token_obj.mpt or "", "")
self._boss_token_loaded = True
async def collect_pending_companies(self, limit: int = 1000, source: Optional[str] = None):
    """Run the pending-company collectors for one source, or for all when ``source`` is None.

    Collectors run in the order zhilian, qcwy, boss; each receives the
    ClickHouse client and ``limit``.
    """
    client = await clickhouse_manager.get_client()
    logger.info(f"Starting to collect pending companies (limit={limit}, source={source or 'all'})...")
    collectors = (
        ("zhilian", "_collect_zhilian"),
        ("qcwy", "_collect_qcwy"),
        ("boss", "_collect_boss"),
    )
    for platform, collector_name in collectors:
        if source is None or source == platform:
            await getattr(self, collector_name)(client, limit)
    logger.info("Finished collecting pending companies.")
async def _collect_zhilian(self, client, limit: int):
    """Queue Zhilian companies seen in recent jobs that still need processing.

    Steps:
      1. Load the company IDs already present in ``job_data.zhilian_company``
         (updated within the last 90 days), shrinking the query when
         ClickHouse reports a memory error.
      2. Load the IDs already queued in ``job_data.pending_company``.
      3. Read distinct (id, name) pairs from ``job_data.zhilian_job`` created
         in the last 30 days and insert up to ``limit`` unseen companies via
         ``self._insert_pending``.

    Args:
        client: ClickHouse client exposing an awaitable ``query(sql)``.
        limit: Maximum number of companies to queue.

    Raises:
        Exception: Any non-memory error from the existing-companies query is
            logged and re-raised.
    """
    logger.info("Collecting Zhilian companies...")

    def existing_query(days: int, row_limit: int, sample: str = "") -> str:
        # PREWHERE narrows the time range before json_data is read.
        return f"""
            SELECT DISTINCT JSONExtractString(json_data, 'companyNumber') as cid
            FROM job_data.zhilian_company{sample}
            PREWHERE updated_at > now() - INTERVAL {days} DAY
              AND json_data != ''
            WHERE JSONExtractString(json_data, 'companyNumber') != ''
            LIMIT {row_limit}
        """

    # (days back, row limit, SAMPLE clause) per attempt: the full 90-day
    # check first, then a 1-day window, then a 1-day window with sampling.
    plans = (
        (90, 50000, ""),
        (1, 5000, ""),
        (1, 2000, " SAMPLE 0.1"),
    )

    existing_cids = set()  # stays empty if every attempt runs out of memory
    for attempt, (days_back, row_limit, sample) in enumerate(plans):
        try:
            logger.info(f"Querying existing Zhilian companies (attempt {attempt+1})...")
            existing_result = await client.query(existing_query(days_back, row_limit, sample))
            existing_cids = {row[0] for row in existing_result.result_rows if row[0]}
            break
        except Exception as e:
            error_str = str(e).lower()
            if "memory" not in error_str:
                logger.error(f"Non-memory error while querying existing companies: {e}")
                raise
            if attempt == 0:
                logger.warning(f"Memory error, reducing time range to {plans[1][0]} days")
            elif attempt == 1:
                logger.warning("Memory error persists, using SAMPLE 0.1")
            else:
                # Last attempt failed too: continue without the exclusion
                # set rather than blocking the whole collection run.
                logger.error(f"Failed to query existing companies after {attempt+1} attempts: {e}")
                logger.warning("Using empty set for existing_cids, continuing with collection...")
                existing_cids = set()

    pending_query = "SELECT DISTINCT company_id FROM job_data.pending_company WHERE source = 'zhilian'"
    pending_result = await client.query(pending_query)
    pending_cids = {row[0] for row in pending_result.result_rows if row[0]}

    # Everything already stored or already queued is skipped. IDs accepted
    # below are added too, because DISTINCT runs over (cid, cname) pairs and
    # the same company can therefore appear under several names.
    seen_cids = existing_cids | pending_cids

    # Only jobs from the last 30 days are scanned; PREWHERE keeps older
    # json_data from being read. The LIMIT is doubled so that enough rows
    # remain after filtering in Python.
    query = f"""
        SELECT DISTINCT
            JSONExtractString(json_data, 'companyNumber') as cid,
            JSONExtractString(json_data, 'companyName') as cname
        FROM job_data.zhilian_job
        PREWHERE created_at > now() - INTERVAL 30 DAY
        WHERE json_data != ''
          AND JSONExtractString(json_data, 'companyNumber') != ''
        LIMIT {limit * 2}
    """
    logger.info(f"Executing SQL for Zhilian (limit={limit * 2}): {query[:500]}...")
    result = await client.query(query)
    if not result.result_rows:
        return

    rows: List[Dict[str, Any]] = []
    for cid, cname in result.result_rows:
        if not cid or cid in seen_cids:
            continue
        if len(rows) >= limit:
            break
        seen_cids.add(cid)
        stamp = datetime.now()  # one timestamp for both columns of the row
        rows.append(
            {
                "source": "zhilian",
                "company_id": cid,
                "company_name": cname,
                "status": "pending",
                "created_at": stamp,
                "updated_at": stamp,
            }
        )
    await self._insert_pending(client, rows)
    logger.info(f"Added {len(rows)} Zhilian companies to pending.")
async def _collect_qcwy(self, client, limit: int):
logger.info("Collecting QCWY companies...")
# 优化先获取已存在的公司ID避免在子查询中读取json_data
# 使用PREWHERE提前过滤时间范围减少需要读取的数据量
# 检查90天内已处理的公司避免重复请求
days_back_existing = 90 # 查询最近90天的数据避免重复请求已处理过的公司
existing_companies_query = f"""
SELECT DISTINCT JSONExtractString(json_data, 'companyId') as cid
FROM job_data.qcwy_company
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'companyId') != ''
LIMIT 50000
"""
# 添加重试机制
existing_result = None
existing_cids = set() # 默认使用空集合
for attempt in range(3):
try:
logger.info(f"Querying existing QCWY companies (attempt {attempt+1})...")
existing_result = await client.query(existing_companies_query)
# 查询成功,提取结果
existing_cids = {row[0] for row in existing_result.result_rows if row[0]}
break
except Exception as e:
error_str = str(e).lower()
if "memory" in error_str or "memory_limit" in error_str:
if attempt == 0:
# 第一次失败:进一步减少时间范围
days_back_existing = 1
existing_companies_query = f"""
SELECT DISTINCT JSONExtractString(json_data, 'companyId') as cid
FROM job_data.qcwy_company
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'companyId') != ''
LIMIT 5000
"""
logger.warning(f"Memory error, reducing time range to {days_back_existing} days")
elif attempt == 1:
# 第二次失败:使用采样
existing_companies_query = f"""
SELECT DISTINCT JSONExtractString(json_data, 'companyId') as cid
FROM job_data.qcwy_company SAMPLE 0.1
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'companyId') != ''
LIMIT 2000
"""
logger.warning(f"Memory error persists, using SAMPLE 0.1")
else:
# 最后一次尝试也失败,使用空集合继续执行(避免阻塞整个流程)
logger.error(f"Failed to query existing companies after {attempt+1} attempts: {e}")
logger.warning("Using empty set for existing_cids, continuing with collection...")
existing_cids = set()
break
else:
# 其他错误直接抛出
logger.error(f"Non-memory error while querying existing companies: {e}")
raise
pending_query = "SELECT DISTINCT company_id FROM job_data.pending_company WHERE source = 'qcwy'"
pending_result = await client.query(pending_query)
pending_cids = {row[0] for row in pending_result.result_rows if row[0]}
# 构建排除列表
exclude_cids = existing_cids | pending_cids
# 优化策略:
# 1. 减少时间范围从30天减少到7天大幅减少扫描的数据量
# 2. 减少LIMIT从limit*2减少到更小的值减少内存占用
# 3. 使用更严格的PREWHERE条件先过滤时间再过滤空json_data和超大JSON
# 4. 限制JSON大小过滤掉过大的json_data可能包含大量嵌套数据
# 5. 分批查询如果limit较大分批处理每次查询更少的数据
days_back = 7 # 从30天减少到7天减少扫描量
# 注意不使用length(json_data)检查,因为它需要读取整个列来计算长度
query_limit = min(limit * 2, 100) # 限制最大查询数量,避免内存超限
# 分批查询策略如果limit较大分批处理
result = None
for attempt in range(3): # 最多尝试3次
try:
# 根据尝试次数调整参数
if attempt == 1:
# 第一次失败后减少时间范围到3天
days_back = 3
query_limit = min(query_limit, 50)
logger.warning(f"Retry {attempt}: Reducing time range to {days_back} days and limit to {query_limit}")
elif attempt == 2:
# 第二次失败后:使用采样
query = f"""
SELECT DISTINCT
JSONExtractString(json_data, 'coId') as cid,
JSONExtractString(json_data, 'companyName') as cname
FROM job_data.qcwy_job SAMPLE 0.1
PREWHERE created_at > now() - INTERVAL {days_back} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'coId') != ''
LIMIT {query_limit}
"""
logger.warning(f"Retry {attempt}: Using SAMPLE 0.1 to reduce memory usage")
result = await client.query(query)
break
# 正常查询或第一次重试
query = f"""
SELECT DISTINCT
JSONExtractString(json_data, 'coId') as cid,
JSONExtractString(json_data, 'companyName') as cname
FROM job_data.qcwy_job
PREWHERE created_at > now() - INTERVAL {days_back} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'coId') != ''
LIMIT {query_limit}
"""
logger.info(f"Executing SQL for QCWY (limit={query_limit}, days={days_back}, attempt={attempt+1}): {query[:400]}...")
result = await client.query(query)
break
except Exception as e:
error_str = str(e).lower()
# 如果查询失败(可能是内存超限),继续重试
if "memory" in error_str or "memory_limit" in error_str:
if attempt < 2:
logger.warning(f"Memory error on attempt {attempt+1}: {e}")
continue
else:
# 最后一次尝试也失败,抛出异常
logger.error(f"Query failed after {attempt+1} attempts: {e}")
raise
else:
# 其他错误直接抛出
logger.error(f"Query failed with non-memory error: {e}")
raise
if not result or not result.result_rows:
logger.info("No QCWY companies found in query result.")
return
# 在 Python 中过滤掉已存在的和待处理的
rows: List[Dict[str, Any]] = []
for cid, cname in result.result_rows:
if not cid or cid in exclude_cids:
continue
if len(rows) >= limit:
break
rows.append(
{
"source": "qcwy",
"company_id": cid,
"company_name": cname,
"status": "pending",
"created_at": datetime.now(),
"updated_at": datetime.now(),
}
)
if rows:
await self._insert_pending(client, rows)
logger.info(f"Added {len(rows)} QCWY companies to pending.")
else:
logger.info("No new QCWY companies found after filtering.")
async def _collect_boss(self, client, limit: int):
logger.info("Collecting Boss companies...")
# 优化先获取已存在的公司ID避免在子查询中读取json_data
# 使用PREWHERE提前过滤时间范围减少需要读取的数据量
# 检查90天内已处理的公司避免重复请求
days_back_existing = 90 # 查询最近90天的数据避免重复请求已处理过的公司
existing_companies_query = f"""
SELECT DISTINCT JSONExtractString(json_data, 'brandId') as cid
FROM job_data.boss_company
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'brandId') != ''
LIMIT 50000
"""
# 添加重试机制
existing_result = None
existing_cids = set() # 默认使用空集合
for attempt in range(3):
try:
logger.info(f"Querying existing Boss companies (attempt {attempt+1})...")
existing_result = await client.query(existing_companies_query)
existing_cids = {row[0] for row in existing_result.result_rows if row[0]}
break
except Exception as e:
error_str = str(e).lower()
if "memory" in error_str or "memory_limit" in error_str:
if attempt == 0:
days_back_existing = 1
existing_companies_query = f"""
SELECT DISTINCT JSONExtractString(json_data, 'brandId') as cid
FROM job_data.boss_company
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'brandId') != ''
"""
logger.warning(f"Memory error, reducing time range to {days_back_existing} days")
elif attempt == 1:
existing_companies_query = f"""
SELECT DISTINCT JSONExtractString(json_data, 'brandId') as cid
FROM job_data.boss_company SAMPLE 0.1
PREWHERE updated_at > now() - INTERVAL {days_back_existing} DAY
AND json_data != ''
WHERE JSONExtractString(json_data, 'brandId') != ''
"""
logger.warning(f"Memory error persists, using SAMPLE 0.1")
else:
logger.error(f"Failed to query existing companies after {attempt+1} attempts: {e}")
logger.warning("Using empty set for existing_cids, continuing with collection...")
existing_cids = set()
break
else:
logger.error(f"Non-memory error while querying existing companies: {e}")
raise
pending_query = "SELECT DISTINCT company_id FROM job_data.pending_company WHERE source = 'boss'"
pending_result = await client.query(pending_query)
pending_cids = {row[0] for row in pending_result.result_rows if row[0]}
# 构建排除列表
exclude_cids = existing_cids | pending_cids
# 优化添加时间范围过滤只查询最近30天的数据减少扫描量
# 使用 PREWHERE 提前过滤时间范围,避免读取大量历史数据的 json_data
# 增加 LIMIT 以便在 Python 中过滤后仍有足够的数据
query = f"""
SELECT DISTINCT
JSONExtractString(json_data, 'brandId') as cid,
JSONExtractString(json_data, 'brandName') as cname
FROM job_data.boss_job
PREWHERE created_at > now() - INTERVAL 30 DAY
WHERE json_data != ''
AND JSONExtractString(json_data, 'brandId') != ''
LIMIT {limit * 2}
"""
logger.info(f"Executing SQL for Boss (limit={limit * 2}): {query[:500]}...")
result = await client.query(query)
if not result.result_rows:
return
# 在 Python 中过滤掉已存在的和待处理的
rows: List[Dict[str, Any]] = []
for cid, cname in result.result_rows:
if not cid or cid in exclude_cids:
continue
if len(rows) >= limit:
break
rows.append(
{
"source": "boss",
"company_id": cid,
"company_name": cname,
"status": "pending",
"created_at": datetime.now(),
"updated_at": datetime.now(),
}
)
await self._insert_pending(client, rows)
logger.info(f"Added {len(rows)} Boss companies to pending.")
async def _insert_pending(self, client, rows: List[Dict[str, Any]]):
if not rows:
return
data: List[List[Any]] = []
for r in rows:
data.append(
[
r["source"],
r["company_id"],
r["company_name"],
r["status"],
"",
r["created_at"],
r["updated_at"],
1,
]
)
await client.insert(
"job_data.pending_company",
data,
column_names=[
"source",
"company_id",
"company_name",
"status",
"error_msg",
"created_at",
"updated_at",
"version",
],
)
async def process_single_company(
    self,
    source: str,
    company_id: str,
    proxy: Optional[str] = None,
    max_delay_seconds: int = 5,
) -> Dict[str, Any]:
    """Fetch one company and record the outcome in ``job_data.pending_company``.

    The latest pending row for ``(source, company_id)`` is looked up to obtain
    the company name and current version; when no row exists the name is
    empty and the version starts from 0. The company is then fetched via
    ``self._fetch_and_save`` and a new row with ``version + 1`` and status
    ``"done"`` or ``"failed"`` is inserted.

    Args:
        source: Source identifier passed to ``self._fetch_and_save``.
        company_id: Company ID within that source.
        proxy: Optional proxy, passed to ``self._apply_proxy`` when truthy.
        max_delay_seconds: When positive, sleep a random 1..max_delay_seconds
            seconds before doing any work.

    Returns:
        Dict with ``success``, ``source``, ``company_id``, ``company_name``,
        ``status``, ``error_msg`` and ``version`` (the version just written).

    Note:
        ``source`` and ``company_id`` are bound as query parameters instead of
        being interpolated into the SQL text.
        NOTE(review): this relies on the client accepting
        ``query(sql, parameters=...)`` with ``{name:Type}`` placeholders, as
        clickhouse-connect does — confirm for ``clickhouse_manager``'s client.
    """
    client = await clickhouse_manager.get_client()
    if proxy:
        self._apply_proxy(proxy)

    if max_delay_seconds and max_delay_seconds > 0:
        await asyncio.sleep(random.randint(1, max_delay_seconds))

    # Not an f-string: the braces are server-side parameter placeholders.
    query = """
        SELECT source, company_id, company_name, version
        FROM job_data.pending_company
        FINAL
        WHERE source = {source:String} AND company_id = {company_id:String}
        ORDER BY version DESC
        LIMIT 1
    """
    result = await client.query(
        query,
        parameters={"source": source, "company_id": company_id},
    )
    if result.result_rows:
        source_value, cid, cname, version = result.result_rows[0]
    else:
        source_value, cid, cname, version = source, company_id, "", 0

    try:
        success = await self._fetch_and_save(source_value, cid)
        status = "done" if success else "failed"
        error_msg = "" if success else "Fetch failed"
    except Exception as e:
        logger.error(f"Error processing {source_value} {cid}: {e}")
        status = "failed"
        error_msg = str(e)

    next_version = int(version) + 1
    now = datetime.now()
    # The message is stored verbatim: insert() sends values as data, not as
    # SQL text, so doubling quotes would only corrupt the stored message.
    await client.insert(
        "job_data.pending_company",
        [[source_value, cid, cname, status, error_msg, now, now, next_version]],
        column_names=[
            "source",
            "company_id",
            "company_name",
            "status",
            "error_msg",
            "created_at",
            "updated_at",
            "version",
        ],
    )
    return {
        "success": status == "done",
        "source": source_value,
        "company_id": cid,
        "company_name": cname,
        "status": status,
        "error_msg": error_msg,
        "version": next_version,
    }
async def process_pending_companies(
    self,
    limit: int = 100,
    source: Optional[str] = None,
    proxy: Optional[str] = None,
    max_delay_seconds: int = 0,
):
    """Process up to ``limit`` pending companies, oldest first.

    Rows with status ``'pending'`` are read from ``job_data.pending_company``
    (optionally restricted to one ``source``). Each company is fetched via
    ``self._fetch_and_save`` and a new row with ``version + 1`` and status
    ``"done"`` or ``"failed"`` is inserted; a failure for one company does not
    stop the remaining ones.

    Args:
        limit: Maximum number of pending rows to process.
        source: When truthy, only rows of this source are processed.
        proxy: Optional proxy, passed to ``self._apply_proxy`` when truthy.
        max_delay_seconds: When positive, sleep a random 1..max_delay_seconds
            seconds before each fetch.

    Note:
        ``source`` is bound as a query parameter instead of being interpolated
        into the SQL text.
        NOTE(review): this relies on the client accepting
        ``query(sql, parameters=...)`` with ``{name:Type}`` placeholders, as
        clickhouse-connect does — confirm for ``clickhouse_manager``'s client.
    """
    client = await clickhouse_manager.get_client()
    logger.info(f"Processing pending companies (limit={limit}, source={source or 'all'})...")
    if proxy:
        self._apply_proxy(proxy)

    where_clause = "WHERE status = 'pending'"
    params = None
    if source:
        # Plain string on purpose: the braces are a parameter placeholder.
        where_clause += " AND source = {source:String}"
        params = {"source": source}

    # int() guarantees that only a number can reach the LIMIT clause.
    query = f"""
        SELECT source, company_id, company_name, version
        FROM job_data.pending_company
        FINAL
        {where_clause}
        ORDER BY created_at ASC
        LIMIT {int(limit)}
    """
    result = await client.query(query, parameters=params)
    if not result.result_rows:
        logger.info("No pending companies to process.")
        return

    for source_value, cid, cname, version in result.result_rows:
        logger.info(f"Processing {source_value} company: {cname} ({cid})")
        try:
            if max_delay_seconds and max_delay_seconds > 0:
                await asyncio.sleep(random.randint(1, max_delay_seconds))
            success = await self._fetch_and_save(source_value, cid)
            status = "done" if success else "failed"
            error_msg = "" if success else "Fetch failed"
        except Exception as e:
            logger.error(f"Error processing {source_value} {cid}: {e}")
            status = "failed"
            error_msg = str(e)

        now = datetime.now()
        # The message is stored verbatim: insert() sends values as data, not
        # as SQL text, so doubling quotes would only corrupt the message.
        await client.insert(
            "job_data.pending_company",
            [[source_value, cid, cname, status, error_msg, now, now, int(version) + 1]],
            column_names=[
                "source",
                "company_id",
                "company_name",
                "status",
                "error_msg",
                "created_at",
                "updated_at",
                "version",
            ],
        )
async def _fetch_and_save(self, source: str, company_id: str) -> bool:
data: Optional[Dict[str, Any]] = None
target_table = ""
if source == "zhilian":
data = self.zhilian_service.get_company_detail(company_id)
target_table = "zhilian_company"
elif source == "qcwy":
data = self.qcwy_service.get_company_info(company_id)
target_table = "qcwy_company"
elif source == "boss":
await self._ensure_boss_token_loaded()
data = self.boss_service.get_company_detail_by_id(company_id)
target_table = "boss_company"
if not data:
logger.error(f"No data returned from source={source} company_id={company_id}")
return False
try:
logger.info(
f"Raw company data from source={source} company_id={company_id}: "
f"{json.dumps(data, ensure_ascii=False)[:2000]}"
)
except Exception as e:
logger.error(f"Failed to log raw company data for source={source} company_id={company_id}: {e}")
client = await clickhouse_manager.get_client()
name = ""
if source == "zhilian":
name = data.get("companyBase", {}).get("companyName", "")
elif source == "qcwy":
name = data.get("companyName", "")
elif source == "boss":
name = data.get("name", "")
json_str = json.dumps(data, ensure_ascii=False)
await client.insert(
f"job_data.{target_table}",
[[0, json_str, name, datetime.now(), datetime.now()]],
column_names=["id", "json_data", "company_name", "created_at", "updated_at"],
)
return True
async def cleanup_old_records(self):
    """Remove finished ('done' or 'failed') rows from ``job_data.pending_company``.

    Intended to be called once a day. A failure of the delete command is
    logged and not re-raised.
    """
    ch_client = await clickhouse_manager.get_client()
    logger.info("Starting cleanup of processed pending companies...")

    # ALTER TABLE ... DELETE is submitted as a ClickHouse mutation, which the
    # server applies asynchronously after the command has been accepted.
    delete_sql = "ALTER TABLE job_data.pending_company DELETE WHERE status IN ('done', 'failed')"
    try:
        await ch_client.command(delete_sql)
        logger.info("Cleanup command executed successfully.")
    except Exception as exc:
        logger.error(f"Cleanup failed: {exc}")
# Module-level CompanyCleaner instance, created when this module is imported.
company_cleaner = CompanyCleaner()

Some files were not shown because too many files have changed in this diff Show More