feat: 优化公司数据去重逻辑,扩大检查范围到90天
This commit is contained in:
commit
59bfefff0e
4
.dockerignore
Normal file
4
.dockerignore
Normal file
@ -0,0 +1,4 @@
|
||||
web/node_modules
|
||||
video
|
||||
local_deploy
|
||||
clickhouse_data
|
||||
1
.gitattributes
vendored
Normal file
1
.gitattributes
vendored
Normal file
@ -0,0 +1 @@
|
||||
*.html linguist-language=python
|
||||
21
.gitignore
vendored
Normal file
21
.gitignore
vendored
Normal file
@ -0,0 +1,21 @@
|
||||
__pycache__/
|
||||
.idea/
|
||||
venv/
|
||||
.venv/
|
||||
.mypy_cache/
|
||||
.vscode
|
||||
.ruff_cache/
|
||||
.pytest_cache/
|
||||
migrations/
|
||||
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
db.sqlite3-shm
|
||||
db.sqlite3-wal
|
||||
|
||||
.DS_Store
|
||||
._.DS_Store
|
||||
clickhouse_data
|
||||
data
|
||||
videos
|
||||
videps
|
||||
39
Dockerfile
Normal file
39
Dockerfile
Normal file
@ -0,0 +1,39 @@
|
||||
FROM node:18-alpine AS web
|
||||
|
||||
WORKDIR /opt/vue-fastapi-admin
|
||||
COPY /web ./web
|
||||
|
||||
# 安装pnpm并设置配置
|
||||
RUN npm install -g pnpm && \
|
||||
cd /opt/vue-fastapi-admin/web && \
|
||||
pnpm config set registry https://registry.npmmirror.com && \
|
||||
pnpm install && \
|
||||
pnpm run build
|
||||
|
||||
|
||||
FROM python:3.11-slim-bullseye
|
||||
|
||||
WORKDIR /opt/vue-fastapi-admin
|
||||
ADD . .
|
||||
COPY /deploy/entrypoint.sh .
|
||||
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=core-apt \
|
||||
--mount=type=cache,target=/var/lib/apt,sharing=locked,id=core-apt \
|
||||
sed -i "s@http://.*.debian.org@http://mirrors.ustc.edu.cn@g" /etc/apt/sources.list \
|
||||
&& rm -f /etc/apt/apt.conf.d/docker-clean \
|
||||
&& ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
|
||||
&& echo "Asia/Shanghai" > /etc/timezone \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y --no-install-recommends gcc python3-dev bash nginx vim curl procps net-tools
|
||||
|
||||
RUN pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
|
||||
COPY --from=web /opt/vue-fastapi-admin/web/dist /opt/vue-fastapi-admin/web/dist
|
||||
ADD /deploy/web.conf /etc/nginx/sites-available/web.conf
|
||||
RUN rm -f /etc/nginx/sites-enabled/default \
|
||||
&& ln -s /etc/nginx/sites-available/web.conf /etc/nginx/sites-enabled/
|
||||
|
||||
ENV LANG=zh_CN.UTF-8
|
||||
EXPOSE 80
|
||||
|
||||
ENTRYPOINT [ "sh", "entrypoint.sh" ]
|
||||
21
LICENSE
Normal file
21
LICENSE
Normal file
@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2023 mizhexiaoxiao
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
87
Makefile
Normal file
87
Makefile
Normal file
@ -0,0 +1,87 @@
|
||||
# Build configuration
|
||||
# -------------------
|
||||
|
||||
APP_NAME := `sed -n 's/^ *name.*=.*"\([^"]*\)".*/\1/p' pyproject.toml`
|
||||
APP_VERSION := `sed -n 's/^ *version.*=.*"\([^"]*\)".*/\1/p' pyproject.toml`
|
||||
GIT_REVISION = `git rev-parse HEAD`
|
||||
|
||||
# Introspection targets
|
||||
# ---------------------
|
||||
|
||||
.PHONY: help
|
||||
help: header targets
|
||||
|
||||
.PHONY: header
|
||||
header:
|
||||
@echo "\033[34mEnvironment\033[0m"
|
||||
@echo "\033[34m---------------------------------------------------------------\033[0m"
|
||||
@printf "\033[33m%-23s\033[0m" "APP_NAME"
|
||||
@printf "\033[35m%s\033[0m" $(APP_NAME)
|
||||
@echo ""
|
||||
@printf "\033[33m%-23s\033[0m" "APP_VERSION"
|
||||
@printf "\033[35m%s\033[0m" $(APP_VERSION)
|
||||
@echo ""
|
||||
@printf "\033[33m%-23s\033[0m" "GIT_REVISION"
|
||||
@printf "\033[35m%s\033[0m" $(GIT_REVISION)
|
||||
@echo "\n"
|
||||
|
||||
.PHONY: targets
|
||||
targets:
|
||||
@echo "\033[34mDevelopment Targets\033[0m"
|
||||
@echo "\033[34m---------------------------------------------------------------\033[0m"
|
||||
@perl -nle'print $& if m{^[a-zA-Z_-]+:.*?## .*$$}' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-22s\033[0m %s\n", $$1, $$2}'
|
||||
|
||||
# Development targets
|
||||
# -------------
|
||||
|
||||
.PHONY: install
|
||||
# `uv add <name>` adds a new requirement called <name>; it does not install the
# dependencies already declared in pyproject.toml. `uv sync` does.
install: ## Install dependencies
	uv sync
|
||||
|
||||
|
||||
.PHONY: run
|
||||
run: start
|
||||
|
||||
.PHONY: start
|
||||
start: ## Starts the server
|
||||
python run.py
|
||||
|
||||
# Check, lint and format targets
|
||||
# ------------------------------
|
||||
|
||||
.PHONY: check
|
||||
check: check-format lint
|
||||
|
||||
.PHONY: check-format
|
||||
check-format: ## Dry-run code formatter
|
||||
black ./ --check
|
||||
isort ./ --profile black --check
|
||||
|
||||
.PHONY: lint
|
||||
lint: ## Run ruff
|
||||
ruff check ./app
|
||||
|
||||
.PHONY: format
|
||||
format: ## Run code formatter
|
||||
black ./
|
||||
isort ./ --profile black
|
||||
|
||||
|
||||
.PHONY: test
|
||||
# Load .env into make and export exactly the variables it defines before
# running pytest. `$(shell ...)` is the make function that runs a command;
# `$(sh ...)` is just an undefined variable reference that expands to nothing.
test: ## Run the test suite
	$(eval include .env)
	$(eval export $(shell sed 's/=.*//' .env))
	pytest -vv -s --cache-clear ./
|
||||
|
||||
.PHONY: clean-db
|
||||
clean-db: ## 删除migrations文件夹和db.sqlite3
|
||||
find . -type d -name "migrations" -exec rm -rf {} +
|
||||
rm -f db.sqlite3 db.sqlite3-shm db.sqlite3-wal
|
||||
|
||||
.PHONY: migrate
|
||||
migrate: ## 运行aerich migrate命令生成迁移文件
|
||||
aerich migrate
|
||||
|
||||
.PHONY: upgrade
|
||||
upgrade: ## 运行aerich upgrade命令应用迁移
|
||||
aerich upgrade
|
||||
74
Pipfile
Normal file
74
Pipfile
Normal file
@ -0,0 +1,74 @@
|
||||
[[source]]
|
||||
url = "https://pypi.doubanio.com/simple"
|
||||
verify_ssl = true
|
||||
name = "pip_conf_index_global"
|
||||
|
||||
[packages]
|
||||
aerich = "==0.8.1"
|
||||
aiosqlite = "==0.20.0"
|
||||
annotated-types = "==0.7.0"
|
||||
anyio = "==4.8.0"
|
||||
argon2-cffi = "==23.1.0"
|
||||
argon2-cffi-bindings = "==21.2.0"
|
||||
asyncclick = "==8.1.8"
|
||||
black = "==24.10.0"
|
||||
certifi = "==2024.12.14"
|
||||
cffi = "==1.17.1"
|
||||
click = "==8.1.8"
|
||||
dictdiffer = "==0.9.0"
|
||||
dnspython = "==2.7.0"
|
||||
email-validator = "==2.2.0"
|
||||
fastapi = "==0.111.0"
|
||||
fastapi-cli = "==0.0.7"
|
||||
h11 = "==0.14.0"
|
||||
httpcore = "==1.0.7"
|
||||
httptools = "==0.6.4"
|
||||
httpx = "==0.28.1"
|
||||
idna = "==3.10"
|
||||
iso8601 = "==2.1.0"
|
||||
isort = "==5.13.2"
|
||||
jinja2 = "==3.1.5"
|
||||
loguru = "==0.7.3"
|
||||
markdown-it-py = "==3.0.0"
|
||||
markupsafe = "==3.0.2"
|
||||
mdurl = "==0.1.2"
|
||||
mypy-extensions = "==1.0.0"
|
||||
orjson = "==3.10.14"
|
||||
packaging = "==24.2"
|
||||
passlib = "==1.7.4"
|
||||
pathspec = "==0.12.1"
|
||||
platformdirs = "==4.3.6"
|
||||
pycparser = "==2.22"
|
||||
pydantic = "==2.10.5"
|
||||
pydantic-core = "==2.27.2"
|
||||
pydantic-settings = "==2.7.1"
|
||||
pygments = "==2.19.1"
|
||||
pyjwt = "==2.10.1"
|
||||
pypika-tortoise = "==0.3.2"
|
||||
python-dotenv = "==1.0.1"
|
||||
python-multipart = "==0.0.20"
|
||||
pytz = "==2024.2"
|
||||
pyyaml = "==6.0.2"
|
||||
rich = "==13.9.4"
|
||||
rich-toolkit = "==0.13.2"
|
||||
ruff = "==0.9.1"
|
||||
shellingham = "==1.5.4"
|
||||
sniffio = "==1.3.1"
|
||||
starlette = "==0.37.2"
|
||||
tortoise-orm = "==0.23.0"
|
||||
typer = "==0.15.1"
|
||||
typing-extensions = "==4.12.2"
|
||||
ujson = "==5.10.0"
|
||||
uvicorn = "==0.34.0"
|
||||
uvloop = "==0.21.0"
|
||||
watchfiles = "==1.0.4"
|
||||
websockets = "==14.1"
|
||||
asyncpg = "*"
|
||||
pandas = "*"
|
||||
openpyxl = "*"
|
||||
pysocks = "*"
|
||||
|
||||
[dev-packages]
|
||||
|
||||
[requires]
|
||||
python_version = "3.13"
|
||||
1537
Pipfile.lock
generated
Normal file
1537
Pipfile.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
22
README-en.md
Normal file
22
README-en.md
Normal file
@ -0,0 +1,22 @@
|
||||
docker run -d \
|
||||
--name clickhouse-server \
|
||||
--restart=unless-stopped \
|
||||
--ulimit nofile=262144:262144 \
|
||||
--ulimit memlock=-1:-1 \
|
||||
--cap-add=SYS_NICE \
|
||||
--cap-add=NET_ADMIN \
|
||||
--cap-add=SYS_RESOURCE \
|
||||
--security-opt seccomp=unconfined \
|
||||
--shm-size=8g \
|
||||
-p 8123:8123 \
|
||||
-p 9000:9000 \
|
||||
-p 9004:9004 \
|
||||
-p 9005:9005 \
|
||||
-p 9009:9009 \
|
||||
-e CLICKHOUSE_DB=job_data \
|
||||
-e CLICKHOUSE_USER=data_user \
|
||||
-e CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT=1 \
|
||||
-e CLICKHOUSE_PASSWORD=data_pass \
|
||||
-v $PWD/ch_data:/var/lib/clickhouse/ \
|
||||
-v $PWD/ch_logs:/var/log/clickhouse-server/ \
|
||||
clickhouse/clickhouse-server:latest
|
||||
24
README.md
Normal file
24
README.md
Normal file
@ -0,0 +1,24 @@
|
||||
export DOCKER_DEFAULT_PLATFORM=linux/amd64
|
||||
docker build -t zfc931912343/admin-crawler:v2.1 .
|
||||
docker push zfc931912343/admin-crawler:v2.1
|
||||
|
||||
|
||||
|
||||
docker build -t zfc931912343/boss-crawler:v1 .
|
||||
docker push zfc931912343/boss-crawler:v1
|
||||
|
||||
sudo docker rm -f admin-crawler &&sudo docker run -d --restart=always --name=admin-crawler --log-driver=json-file --log-opt max-size=10m --log-opt max-file=7 -p 9999:80 nbg2akd8w5diy8.xuanyuan.run/zfc931912343/admin-crawler:v1.5
|
||||
|
||||
|
||||
|
||||
|
||||
docker run -d \
|
||||
--name mysql-server \
|
||||
--restart always \
|
||||
-p 3306:3306 \
|
||||
-v /opt/mysql/data:/var/lib/mysql \
|
||||
-e MYSQL_ROOT_PASSWORD=jobdata123 \
|
||||
-e MYSQL_DATABASE=job_data \
|
||||
mysql:8.0 \
|
||||
--character-set-server=utf8mb4 \
|
||||
--collation-server=utf8mb4_unicode_ci
|
||||
57
app/__init__.py
Normal file
57
app/__init__.py
Normal file
@ -0,0 +1,57 @@
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
import os
|
||||
from tortoise import Tortoise
|
||||
|
||||
from app.core.exceptions import SettingNotFound
|
||||
from app.core.init_app import (
|
||||
init_data,
|
||||
make_middlewares,
|
||||
register_exceptions,
|
||||
register_routers,
|
||||
)
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from app.core.scheduler import start_scheduler, shutdown_scheduler
|
||||
|
||||
try:
    from app.settings.config import settings
except ImportError as exc:
    # Surface the project's own error type, but keep the original ImportError
    # as the explicit cause so the module that failed to import stays visible
    # in the traceback.
    raise SettingNotFound("Can not import settings") from exc
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage application start-up and shutdown.

    Start-up: initialise Tortoise ORM from ``settings.TORTOISE_ORM``, generate
    the schemas, run ``init_data`` and start the scheduler.

    Shutdown: stop the scheduler, then close the Tortoise and ClickHouse
    connections. The teardown runs in a ``finally`` block so it is executed
    even when an exception is thrown into the context manager at ``yield``.
    """
    await Tortoise.init(config=settings.TORTOISE_ORM)
    await Tortoise.generate_schemas()
    await init_data()
    start_scheduler()
    try:
        yield
    finally:
        # Stop the scheduler before closing connections.
        # NOTE(review): presumably scheduled jobs use these connections, so
        # stopping first avoids a job firing against a closed pool - confirm.
        shutdown_scheduler()
        # Release all database connections.
        await Tortoise.close_connections()
        await clickhouse_manager.close()
|
||||
|
||||
|
||||
def create_app() -> FastAPI:
    """Build and configure the FastAPI application.

    The app is created with the title, description and version from
    ``settings``, the middlewares from ``make_middlewares`` and the module's
    ``lifespan`` handler. Exception handlers are registered, routers are
    registered under ``/api``, and ``<BASE_DIR>/static`` is mounted at
    ``/static`` when that directory exists.

    Returns:
        The configured FastAPI instance.
    """
    application = FastAPI(
        title=settings.APP_TITLE,
        description=settings.APP_DESCRIPTION,
        version=settings.VERSION,
        openapi_url="/openapi.json",
        middleware=make_middlewares(),
        lifespan=lifespan,
    )
    register_exceptions(application)
    register_routers(application, prefix="/api")

    # Static files are optional: only mount them when the directory is present.
    static_root = os.path.join(settings.BASE_DIR, "static")
    if not os.path.exists(static_root):
        return application

    application.mount("/static", StaticFiles(directory=static_root), name="static")
    return application
|
||||
|
||||
|
||||
app = create_app()
|
||||
9
app/api/__init__.py
Normal file
9
app/api/__init__.py
Normal file
@ -0,0 +1,9 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .v1 import v1_router
|
||||
|
||||
api_router = APIRouter()
|
||||
api_router.include_router(v1_router, prefix="/v1")
|
||||
|
||||
|
||||
__all__ = ["api_router"]
|
||||
39
app/api/v1/__init__.py
Normal file
39
app/api/v1/__init__.py
Normal file
@ -0,0 +1,39 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from app.core.dependency import DependPermission
|
||||
|
||||
from .apis import apis_router
|
||||
from .auditlog import auditlog_router
|
||||
from .base import base_router
|
||||
from .depts import depts_router
|
||||
from .menus import menus_router
|
||||
from .roles import roles_router
|
||||
from .users import users_router
|
||||
from .token import token_router
|
||||
from .proxy import proxy_router
|
||||
from .job import job_router
|
||||
from .stats import stats_router
|
||||
from .pipeline import pipeline_router
|
||||
from .keyword import keyword_router
|
||||
from .cleaning import cleaning_router
|
||||
from .analytics import router as analytics_router
|
||||
|
||||
|
||||
v1_router = APIRouter()
|
||||
|
||||
v1_router.include_router(base_router, prefix="/base")
|
||||
v1_router.include_router(users_router, prefix="/user", dependencies=[DependPermission])
|
||||
v1_router.include_router(roles_router, prefix="/role", dependencies=[DependPermission])
|
||||
v1_router.include_router(menus_router, prefix="/menu", dependencies=[DependPermission])
|
||||
v1_router.include_router(apis_router, prefix="/api", dependencies=[DependPermission])
|
||||
v1_router.include_router(depts_router, prefix="/dept", dependencies=[DependPermission])
|
||||
v1_router.include_router(auditlog_router, prefix="/auditlog", dependencies=[DependPermission])
|
||||
v1_router.include_router(job_router, prefix="/job", tags=["数据入库"])
|
||||
v1_router.include_router(job_router, prefix="/universal", tags=["通用数据接口"])
|
||||
v1_router.include_router(token_router, prefix="/token", tags=["Token管理"])
|
||||
v1_router.include_router(proxy_router, prefix="/proxy", tags=["代理IP管理"])
|
||||
v1_router.include_router(stats_router, prefix="/stats")
|
||||
v1_router.include_router(pipeline_router, prefix="/pipeline")
|
||||
v1_router.include_router(keyword_router, prefix="/keyword")
|
||||
v1_router.include_router(cleaning_router, prefix="/cleaning", dependencies=[DependPermission])
|
||||
v1_router.include_router(analytics_router, prefix="/analytics", tags=["数据分析"])
|
||||
92
app/api/v1/analytics.py
Normal file
92
app/api/v1/analytics.py
Normal file
@ -0,0 +1,92 @@
|
||||
from typing import Optional, List
|
||||
from datetime import datetime, date, timezone
|
||||
try:
|
||||
from zoneinfo import ZoneInfo
|
||||
except ImportError:
|
||||
from backports.zoneinfo import ZoneInfo
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from app.services.analytics_service import AnalyticsService
|
||||
from app.schemas.analytics import (
|
||||
JobStatisticsResponse,
|
||||
)
|
||||
|
||||
router = APIRouter()
|
||||
CHINA_TZ = ZoneInfo("Asia/Shanghai")
|
||||
|
||||
async def get_analytics_service() -> AnalyticsService:
    """Dependency provider: wrap the ClickHouse client in an AnalyticsService.

    Returns:
        An ``AnalyticsService`` built around the client returned by
        ``clickhouse_manager.get_client()``.
    """
    return AnalyticsService(await clickhouse_manager.get_client())
|
||||
|
||||
def to_utc(dt: datetime) -> datetime:
    """Convert a datetime to UTC.

    Naive datetimes are interpreted as local (Asia/Shanghai) time; aware
    datetimes keep their own offset.

    Args:
        dt: The datetime to convert.

    Returns:
        The same instant expressed as an aware datetime in UTC.
    """
    localized = dt if dt.tzinfo is not None else dt.replace(tzinfo=CHINA_TZ)
    return localized.astimezone(timezone.utc)
|
||||
|
||||
@router.get("/overview", response_model=JobStatisticsResponse, summary="获取职位统计总览")
async def get_overview(
    from_date: Optional[date] = None,
    to_date: Optional[date] = None,
    city: Optional[str] = None,
    service: AnalyticsService = Depends(get_analytics_service),
):
    """Return the job statistics overview.

    Args:
        from_date: Optional first day of the window (start of day is used).
        to_date: Optional last day of the window (end of day is used).
        city: Optional city filter; omitted from the filters when empty.
        service: Analytics service injected by FastAPI.

    Returns:
        Whatever ``service.get_job_statistics`` returns for the given filters
        and UTC time bounds.
    """
    # Expand the dates to full-day bounds and convert them through ``to_utc``.
    from_dt = None
    if from_date:
        from_dt = to_utc(datetime.combine(from_date, datetime.min.time()))

    to_dt = None
    if to_date:
        to_dt = to_utc(datetime.combine(to_date, datetime.max.time()))

    filters = {"city": city} if city else {}
    return await service.get_job_statistics(filters=filters, from_dt=from_dt, to_dt=to_dt)
|
||||
|
||||
@router.get("/trend/volume", summary="获取数据量趋势")
async def get_volume_trend(
    # ``pattern`` replaces the deprecated ``regex`` keyword of ``Query``.
    interval: str = Query("day", pattern="^(day|hour|week|month)$"),
    from_date: Optional[date] = None,
    to_date: Optional[date] = None,
    from_datetime: Optional[datetime] = None,
    to_datetime: Optional[datetime] = None,
    service: AnalyticsService = Depends(get_analytics_service),
):
    """Return the data-volume trend bucketed by ``interval``.

    Exact datetimes take precedence over dates so that hour-level queries can
    pass a precise window; for the other granularities a date is expanded to
    the start (lower bound) or end (upper bound) of that day.

    Args:
        interval: One of ``day``, ``hour``, ``week`` or ``month``.
        from_date: Optional first day of the window.
        to_date: Optional last day of the window.
        from_datetime: Optional exact lower bound; overrides ``from_date``.
        to_datetime: Optional exact upper bound; overrides ``to_date``.
        service: Analytics service injected by FastAPI.

    Returns:
        Whatever ``service.get_volume_trend`` returns for the UTC window.
    """
    if from_datetime is not None:
        from_dt = to_utc(from_datetime)
    elif from_date is not None:
        from_dt = to_utc(datetime.combine(from_date, datetime.min.time()))
    else:
        from_dt = None

    if to_datetime is not None:
        to_dt = to_utc(to_datetime)
    elif to_date is not None:
        to_dt = to_utc(datetime.combine(to_date, datetime.max.time()))
    else:
        to_dt = None

    return await service.get_volume_trend(interval=interval, from_dt=from_dt, to_dt=to_dt)
|
||||
|
||||
@router.get("/distribution/source", summary="获取数据来源分布")
async def get_source_distribution(
    from_date: Optional[date] = None,
    to_date: Optional[date] = None,
    from_datetime: Optional[datetime] = None,
    to_datetime: Optional[datetime] = None,
    service: AnalyticsService = Depends(get_analytics_service),
):
    """Return the distribution of records per data source.

    An exact datetime bound wins over the corresponding date; a date is
    expanded to the start (lower bound) or end (upper bound) of that day.
    Both bounds are converted with ``to_utc`` before being passed on.

    Returns:
        Whatever ``service.get_source_distribution`` returns for the window.
    """
    # Pick the local lower bound: explicit datetime first, then start of day.
    lower = from_datetime
    if not lower and from_date:
        lower = datetime.combine(from_date, datetime.min.time())

    # Pick the local upper bound: explicit datetime first, then end of day.
    upper = to_datetime
    if not upper and to_date:
        upper = datetime.combine(to_date, datetime.max.time())

    from_dt = to_utc(lower) if lower else None
    to_dt = to_utc(upper) if upper else None
    return await service.get_source_distribution(from_dt=from_dt, to_dt=to_dt)
|
||||
8
app/api/v1/apis/__init__.py
Normal file
8
app/api/v1/apis/__init__.py
Normal file
@ -0,0 +1,8 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .apis import router
|
||||
|
||||
apis_router = APIRouter()
|
||||
apis_router.include_router(router, tags=["API模块"])
|
||||
|
||||
__all__ = ["apis_router"]
|
||||
67
app/api/v1/apis/apis.py
Normal file
67
app/api/v1/apis/apis.py
Normal file
@ -0,0 +1,67 @@
|
||||
from fastapi import APIRouter, Query
|
||||
from tortoise.expressions import Q
|
||||
|
||||
from app.controllers.api import api_controller
|
||||
from app.schemas import Success, SuccessExtra
|
||||
from app.schemas.apis import *
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/list", summary="查看API列表")
async def list_api(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
    path: str = Query(None, description="API路径"),
    summary: str = Query(None, description="API简介"),
    tags: str = Query(None, description="API模块"),
):
    """Return one page of API records, optionally filtered by substring.

    Each of ``path``, ``summary`` and ``tags`` adds a ``contains`` condition
    when a non-empty value is supplied. Results are ordered by ``tags`` and
    then ``id``.
    """
    substring_filters = {
        "path__contains": path,
        "summary__contains": summary,
        "tags__contains": tags,
    }
    q = Q()
    for lookup, value in substring_filters.items():
        if value:
            q &= Q(**{lookup: value})

    total, api_objs = await api_controller.list(page=page, page_size=page_size, search=q, order=["tags", "id"])

    data = []
    for api_obj in api_objs:
        data.append(await api_obj.to_dict())
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
|
||||
|
||||
|
||||
@router.get("/get", summary="查看Api")
async def get_api(
    id: int = Query(..., description="Api"),
):
    """Return a single API record, looked up by its id, as a dictionary."""
    record = await api_controller.get(id=id)
    return Success(data=await record.to_dict())
|
||||
|
||||
|
||||
@router.post("/create", summary="创建Api")
async def create_api(
    api_in: ApiCreate,
):
    """Create a new API record from the request body."""
    await api_controller.create(obj_in=api_in)
    response = Success(msg="Created Successfully")
    return response
|
||||
|
||||
|
||||
@router.post("/update", summary="更新Api")
async def update_api(
    api_in: ApiUpdate,
):
    """Update the API record identified by ``api_in.id`` with the request body."""
    target_id = api_in.id
    await api_controller.update(id=target_id, obj_in=api_in)
    return Success(msg="Update Successfully")
|
||||
|
||||
|
||||
@router.delete("/delete", summary="删除Api")
async def delete_api(
    api_id: int = Query(..., description="ApiID"),
):
    """Remove the API record with the given id."""
    await api_controller.remove(id=api_id)
    response = Success(msg="Deleted Success")
    return response
|
||||
|
||||
|
||||
@router.post("/refresh", summary="刷新API列表")
async def refresh_api():
    """Ask the API controller to refresh the stored API list."""
    await api_controller.refresh_api()
    response = Success(msg="OK")
    return response
|
||||
8
app/api/v1/auditlog/__init__.py
Normal file
8
app/api/v1/auditlog/__init__.py
Normal file
@ -0,0 +1,8 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .auditlog import router
|
||||
|
||||
auditlog_router = APIRouter()
|
||||
auditlog_router.include_router(router, tags=["审计日志模块"])
|
||||
|
||||
__all__ = ["auditlog_router"]
|
||||
48
app/api/v1/auditlog/auditlog.py
Normal file
48
app/api/v1/auditlog/auditlog.py
Normal file
@ -0,0 +1,48 @@
|
||||
from datetime import datetime
|
||||
from fastapi import APIRouter, Query
|
||||
from tortoise.expressions import Q
|
||||
|
||||
from app.models.admin import AuditLog
|
||||
from app.schemas import SuccessExtra
|
||||
from app.schemas.apis import *
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/list", summary="查看操作日志")
async def get_audit_log_list(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
    username: str = Query("", description="操作人名称"),
    module: str = Query("", description="功能模块"),
    method: str = Query("", description="请求方法"),
    summary: str = Query("", description="接口描述"),
    path: str = Query("", description="请求路径"),
    status: int = Query(None, description="状态码"),
    # ``None`` (not ``""``) is the "not provided" default for datetime
    # parameters, matching ``status`` above; an empty string is not a valid
    # datetime default.
    start_time: datetime = Query(None, description="开始时间"),
    end_time: datetime = Query(None, description="结束时间"),
):
    """Return one page of audit-log entries, newest first.

    Text parameters add a case-insensitive ``icontains`` condition when
    non-empty; ``status`` adds an exact match; ``start_time`` / ``end_time``
    restrict ``created_at`` to a range or to a single open-ended bound.

    Returns:
        ``SuccessExtra`` carrying the page of entries plus the total number of
        entries matching the filters.
    """
    text_filters = {
        "username__icontains": username,
        "module__icontains": module,
        "method__icontains": method,
        "summary__icontains": summary,
        "path__icontains": path,
    }
    q = Q()
    for lookup, value in text_filters.items():
        if value:
            q &= Q(**{lookup: value})
    if status:
        q &= Q(status=status)

    if start_time and end_time:
        q &= Q(created_at__range=[start_time, end_time])
    elif start_time:
        q &= Q(created_at__gte=start_time)
    elif end_time:
        q &= Q(created_at__lte=end_time)

    audit_log_objs = await AuditLog.filter(q).offset((page - 1) * page_size).limit(page_size).order_by("-created_at")
    total = await AuditLog.filter(q).count()
    data = [await audit_log.to_dict() for audit_log in audit_log_objs]
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
|
||||
8
app/api/v1/base/__init__.py
Normal file
8
app/api/v1/base/__init__.py
Normal file
@ -0,0 +1,8 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .base import router
|
||||
|
||||
base_router = APIRouter()
|
||||
base_router.include_router(router, tags=["基础模块"])
|
||||
|
||||
__all__ = ["base_router"]
|
||||
103
app/api/v1/base/base.py
Normal file
103
app/api/v1/base/base.py
Normal file
@ -0,0 +1,103 @@
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
from app.controllers.user import user_controller
|
||||
from app.core.ctx import CTX_USER_ID
|
||||
from app.core.dependency import DependAuth
|
||||
from app.models.admin import Api, Menu, Role, User
|
||||
from app.schemas.base import Fail, Success
|
||||
from app.schemas.login import *
|
||||
from app.schemas.users import UpdatePassword
|
||||
from app.settings import settings
|
||||
from app.utils.jwt_utils import create_access_token
|
||||
from app.utils.password import get_password_hash, verify_password
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post("/access_token", summary="获取token")
async def login_access_token(credentials: CredentialsSchema):
    """Authenticate the credentials and return a freshly issued access token.

    The user's last-login record is updated, then a JWT is created whose
    expiry is the current UTC time plus
    ``settings.JWT_ACCESS_TOKEN_EXPIRE_MINUTES`` minutes.

    Returns:
        ``Success`` whose data is the dumped ``JWTOut`` (token and username).
    """
    user: User = await user_controller.authenticate(credentials)
    await user_controller.update_last_login(user.id)

    lifetime = timedelta(minutes=settings.JWT_ACCESS_TOKEN_EXPIRE_MINUTES)
    payload = JWTPayload(
        user_id=user.id,
        username=user.username,
        is_superuser=user.is_superuser,
        exp=datetime.now(timezone.utc) + lifetime,
    )
    token_out = JWTOut(
        access_token=create_access_token(data=payload),
        username=user.username,
    )
    return Success(data=token_out.model_dump())
|
||||
|
||||
|
||||
@router.get("/userinfo", summary="查看用户信息", dependencies=[DependAuth])
async def get_userinfo():
    """Return the current user's profile without the password field.

    The user id is read from the request context; a fixed avatar URL is added
    to the returned dictionary.
    """
    current_user = await user_controller.get(id=CTX_USER_ID.get())
    profile = await current_user.to_dict(exclude_fields=["password"])
    profile.update(avatar="https://avatars.githubusercontent.com/u/54677442?v=4")
    return Success(data=profile)
|
||||
|
||||
|
||||
@router.get("/usermenu", summary="查看用户菜单", dependencies=[DependAuth])
async def get_user_menu():
    """Return the current user's menus as a two-level tree.

    Superusers get every menu; other users get the de-duplicated union of the
    menus attached to their roles. Menus whose ``parent_id`` is 0 become the
    top-level entries, each carrying its direct children under ``children``.
    """
    current_user = await User.filter(id=CTX_USER_ID.get()).first()

    menus: list[Menu] = []
    if current_user.is_superuser:
        menus = await Menu.all()
    else:
        roles: list[Role] = await current_user.roles
        for role in roles:
            menus += await role.menus
        # The same menu can be granted through several roles.
        menus = list(set(menus))

    top_level: list[Menu] = [entry for entry in menus if entry.parent_id == 0]

    tree = []
    for root in top_level:
        node = await root.to_dict()
        node["children"] = [await entry.to_dict() for entry in menus if entry.parent_id == root.id]
        tree.append(node)
    return Success(data=tree)
|
||||
|
||||
|
||||
@router.get("/userapi", summary="查看用户API", dependencies=[DependAuth])
async def get_user_api():
    """Return the API identifiers the current user may call.

    Each identifier is the lower-cased HTTP method concatenated with the path.
    Superusers get one identifier per stored API; other users get the
    de-duplicated identifiers of the APIs attached to their roles.
    """
    current_user = await User.filter(id=CTX_USER_ID.get()).first()

    if current_user.is_superuser:
        every_api: list[Api] = await Api.all()
        return Success(data=[entry.method.lower() + entry.path for entry in every_api])

    roles: list[Role] = await current_user.roles
    identifiers = []
    for role in roles:
        role_apis: list[Api] = await role.apis
        identifiers += [entry.method.lower() + entry.path for entry in role_apis]
    return Success(data=list(set(identifiers)))
|
||||
|
||||
|
||||
@router.post("/update_password", summary="修改密码", dependencies=[DependAuth])
async def update_user_password(req_in: UpdatePassword):
    """Change the current user's password after verifying the old one.

    Returns:
        ``Fail`` when the old password does not verify; otherwise ``Success``
        once the newly hashed password has been saved.
    """
    user = await user_controller.get(CTX_USER_ID.get())

    # Guard clause: refuse the change unless the old password matches.
    if not verify_password(req_in.old_password, user.password):
        return Fail(msg="旧密码验证错误!")

    user.password = get_password_hash(req_in.new_password)
    await user.save()
    return Success(msg="修改成功")
|
||||
5
app/api/v1/cleaning/__init__.py
Normal file
5
app/api/v1/cleaning/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
from fastapi import APIRouter
|
||||
from .cleaning import router
|
||||
|
||||
cleaning_router = APIRouter()
|
||||
cleaning_router.include_router(router, tags=["数据清洗"])
|
||||
321
app/api/v1/cleaning/cleaning.py
Normal file
321
app/api/v1/cleaning/cleaning.py
Normal file
@ -0,0 +1,321 @@
|
||||
from fastapi import APIRouter, File, UploadFile, Form, Body, Query
|
||||
from app.services.cleaning import CleaningService
|
||||
from app.services.company_cleaner import company_cleaner
|
||||
from app.controllers.cleaning import cleaning_controller
|
||||
from app.schemas import Success, SuccessExtra
|
||||
from app.models.cleaning import CleaningTask
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from tortoise.expressions import Q
|
||||
from typing import Optional
|
||||
import json
|
||||
|
||||
router = APIRouter()
|
||||
cleaning_service = CleaningService()
|
||||
|
||||
@router.get("/stats", summary="获取公司清洗统计信息")
async def get_stats():
    """Return statistics about the pending-company table in ClickHouse.

    The payload holds the number of rows with status ``pending``, the number
    of rows with status ``done`` updated today, and a per-source breakdown of
    the ``pending`` / ``done`` / ``failed`` counts plus a per-source total.
    """
    client = await clickhouse_manager.get_client()

    async def fetch_count(sql: str):
        # Run a single-value count query; an empty result counts as 0.
        rows = (await client.query(sql)).result_rows
        return rows[0][0] if rows else 0

    pending_count = await fetch_count(
        "SELECT count() FROM job_data.pending_company FINAL WHERE status = 'pending'"
    )
    today_count = await fetch_count(
        "SELECT count() FROM job_data.pending_company FINAL WHERE status = 'done' AND toDate(updated_at) = today()"
    )

    dist_res = await client.query(
        """
        SELECT source, status, count()
        FROM job_data.pending_company FINAL
        GROUP BY source, status
        ORDER BY source, status
    """
    )

    # Fold the (source, status, count) rows into one bucket per source.
    per_source = {}
    for source, status, count in dist_res.result_rows:
        bucket = per_source.setdefault(source, {"pending": 0, "done": 0, "failed": 0, "total": 0})
        if status in bucket:
            bucket[status] = count
        bucket["total"] += count

    return Success(
        data={
            "total_pending": pending_count,
            "today_processed": today_count,
            "details": [{"source": name, **counts} for name, counts in per_source.items()],
        }
    )
|
||||
|
||||
|
||||
@router.get("/companies", summary="获取公司清洗列表")
async def get_companies_list(
    page: int = Query(1, ge=1),
    page_size: int = Query(20, ge=1, le=100),
    source: Optional[str] = Query(None),
    status: Optional[str] = Query(None),
):
    """Return one page of pending-company rows, newest update first.

    Args:
        page: 1-based page number.
        page_size: Rows per page (1-100).
        source: Optional exact match on the ``source`` column.
        status: Optional exact match on the ``status`` column.

    Returns:
        ``SuccessExtra`` with the page of rows and the total number of rows
        matching the filters.
    """
    client = await clickhouse_manager.get_client()
    offset = (page - 1) * page_size

    # ``source`` and ``status`` come straight from the query string, so they
    # are passed as bound query parameters ({name:Type} placeholders) instead
    # of being interpolated into the SQL text.
    where_clauses = []
    params = {}
    if source:
        where_clauses.append("source = {source:String}")
        params["source"] = source
    if status:
        where_clauses.append("status = {status:String}")
        params["status"] = status

    where_sql = " WHERE " + " AND ".join(where_clauses) if where_clauses else ""

    # Total number of matching rows.
    count_sql = f"SELECT count() FROM job_data.pending_company FINAL {where_sql}"
    count_res = await client.query(count_sql, parameters=params)
    total = count_res.result_rows[0][0] if count_res.result_rows else 0

    # Requested page. ``page_size`` and ``offset`` are integers already
    # validated by the ``Query`` constraints above, so formatting them into
    # the statement is safe.
    sql = f"""
        SELECT source, company_id, company_name, status, error_msg, created_at, updated_at
        FROM job_data.pending_company FINAL
        {where_sql}
        ORDER BY updated_at DESC
        LIMIT {page_size} OFFSET {offset}
    """
    res = await client.query(sql, parameters=params)

    data = [
        {
            "source": row[0],
            "company_id": row[1],
            "company_name": row[2],
            "status": row[3],
            "error_msg": row[4],
            "created_at": row[5].isoformat() if row[5] else None,
            "updated_at": row[6].isoformat() if row[6] else None,
        }
        for row in res.result_rows
    ]

    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
|
||||
|
||||
|
||||
@router.get("/company-detail", summary="获取公司清洗详情")
async def get_company_cleaning_detail(
    source: str = Query(..., description="数据源"),
    company_id: str = Query(..., description="公司ID"),
    company_name: Optional[str] = Query(None, description="公司名称"),
):
    """Return the most recently updated cleaned record for one company.

    Args:
        source: Data source key; must be ``boss``, ``qcwy`` or ``zhilian``.
        company_id: Company identifier. Only used for matching when
            ``source`` is ``qcwy``; always echoed back in the response.
        company_name: Company name. Required for every source except
            ``qcwy``, where it is an optional extra match criterion.

    Returns:
        ``Success`` with the record, or a ``Success`` carrying code 400
        (unsupported source / missing name) or 404 (no matching row).
    """
    client = await clickhouse_manager.get_client()
    table_map = {
        "boss": "boss_company",
        "qcwy": "qcwy_company",
        "zhilian": "zhilian_company",
    }
    # The table name is taken from this fixed map, never from user input.
    table = table_map.get(source)
    if not table:
        return Success(code=400, msg="不支持的数据源")

    if source == "qcwy":
        # The id may sit under several JSON keys depending on the payload.
        conditions = [
            "JSONExtractString(json_data, 'companyId') = {company_id:String}",
            "JSONExtractString(json_data, 'coId') = {company_id:String}",
            "JSONExtractString(json_data, 'coinfo', 'coid') = {company_id:String}",
        ]
        params = {"company_id": str(company_id)}
        # Match by name only when a name was supplied; binding an empty
        # string would match every row whose company_name is empty.
        if company_name:
            conditions.append("company_name = {company_name:String}")
            params["company_name"] = str(company_name)
        where_sql = " OR ".join(conditions)
    else:
        if not company_name:
            return Success(code=400, msg="缺少公司名称")
        where_sql = "company_name = {company_name:String}"
        params = {"company_name": str(company_name)}

    sql = f"""
        SELECT json_data, company_name, created_at, updated_at
        FROM job_data.{table}
        WHERE {where_sql}
        ORDER BY updated_at DESC
        LIMIT 1
    """
    res = await client.query(sql, parameters=params)
    if not res.result_rows:
        return Success(code=404, msg="未找到公司清洗结果")

    row = res.result_rows[0]
    raw_json = row[0]
    try:
        data = json.loads(raw_json)
    except (TypeError, ValueError):
        # Not valid JSON (or not a string at all): return it untouched.
        data = {"raw": raw_json}

    return Success(
        data={
            "source": source,
            "company_id": company_id,
            "company_name": row[1],
            "created_at": row[2].isoformat() if row[2] else None,
            "updated_at": row[3].isoformat() if row[3] else None,
            "data": data,
        }
    )
|
||||
|
||||
|
||||
@router.post("/collect-pending", summary="分析待处理数据")
async def collect_pending_companies_api(
    limit: int = Body(1000, embed=True, ge=1, le=10000),
    source: Optional[str] = Body(None, embed=True)
):
    """Run ``company_cleaner.collect_pending_companies`` and report completion.

    Per the original description, this analyses job data and collects
    pending company ids into the ``pending_company`` table.

    Args:
        limit: Upper bound forwarded to the collector (1-10000).
        source: Optional data source forwarded to the collector.
    """
    await company_cleaner.collect_pending_companies(source=source, limit=limit)
    done_msg = f"已完成数据分析,已收集待处理公司(上限 {limit} 条)"
    return Success(msg=done_msg)
|
||||
|
||||
|
||||
@router.post("/run-pending", summary="手动执行待处理公司清洗")
async def run_pending_companies(
    limit: int = Body(100, embed=True, ge=1, le=5000),
    source: Optional[str] = Body(None, embed=True),
    proxy: Optional[str] = Body(None, embed=True),
    max_delay_seconds: int = Body(5, embed=True),
):
    """Manually trigger cleaning of pending companies.

    All arguments are forwarded unchanged to
    ``company_cleaner.process_pending_companies``. According to the original
    note, only records currently in the ``pending`` state are processed and
    already-processed records are not run again (that logic lives in the
    cleaner, not here).

    Args:
        limit: Maximum number of records to process (1-5000).
        source: Optional data source filter.
        proxy: Optional proxy setting.
        max_delay_seconds: Delay setting handed to the cleaner.
    """
    cleaner_kwargs = {
        "limit": limit,
        "source": source,
        "proxy": proxy,
        "max_delay_seconds": max_delay_seconds,
    }
    await company_cleaner.process_pending_companies(**cleaner_kwargs)
    triggered_msg = f"已触发执行最近 {limit} 条待处理公司清洗任务"
    return Success(msg=triggered_msg)
|
||||
|
||||
|
||||
@router.post("/crawl-execute", summary="爬取并执行待处理公司清洗")
async def crawl_execute_pending(
    limit: int = Body(100, embed=True, ge=1, le=5000),
    source: Optional[str] = Body(None, embed=True),
    proxy: Optional[str] = Body(None, embed=True),
    max_delay_seconds: int = Body(5, embed=True),
):
    """Collect pending companies, then process them, in one request.

    The collection step receives only ``source`` (so it runs with the
    collector's own default limit); ``limit`` applies to the processing step.

    Args:
        limit: Maximum number of records to process (1-5000).
        source: Optional data source filter for both steps.
        proxy: Optional proxy setting for the processing step.
        max_delay_seconds: Delay setting handed to the processing step.
    """
    # Step 1: refresh the pending list.
    await company_cleaner.collect_pending_companies(source=source)

    # Step 2: work through the pending records.
    process_kwargs = {
        "limit": limit,
        "source": source,
        "proxy": proxy,
        "max_delay_seconds": max_delay_seconds,
    }
    await company_cleaner.process_pending_companies(**process_kwargs)

    triggered_msg = f"已触发爬取并执行最近 {limit} 条待处理公司清洗任务"
    return Success(msg=triggered_msg)
|
||||
|
||||
|
||||
@router.post("/process-company", summary="执行单个公司清洗任务")
async def process_single_company_api(
    source: str = Body(..., embed=True),
    company_id: str = Body(..., embed=True),
    proxy: Optional[str] = Body(None, embed=True),
    max_delay_seconds: int = Body(5, embed=True),
):
    """Run the cleaning task for one company and return its result.

    Args:
        source: Data source of the company.
        company_id: Identifier of the company to process.
        proxy: Optional proxy setting.
        max_delay_seconds: Delay setting handed to the cleaner.

    Returns:
        ``Success`` whose message reflects the truthiness of the result's
        ``"success"`` entry and whose data is the raw result.
    """
    outcome = await company_cleaner.process_single_company(
        source=source,
        company_id=company_id,
        proxy=proxy,
        max_delay_seconds=max_delay_seconds,
    )
    if outcome.get("success"):
        msg = "任务执行成功"
    else:
        msg = "任务执行失败"
    return Success(msg=msg, data=outcome)
|
||||
|
||||
@router.post("/upload", summary="上传文件并保存任务")
async def upload_file(
    file: UploadFile = File(...),
    clean_type: str = Form("auto"),
    platform: str = Form("auto"),
    proxy: Optional[str] = Form(None)
):
    """Parse an uploaded file into targets and store one pending task each.

    Args:
        file: Uploaded file handed to ``cleaning_service.parse_file``.
        clean_type: Cleaning type stored on every created task.
        platform: Platform stored on every created task.
        proxy: Optional proxy stored on every created task.

    Returns:
        ``Success`` whose message states how many tasks were created.
    """
    targets = await cleaning_service.parse_file(file)

    tasks = []
    for entry in targets:
        tasks.append(
            CleaningTask(
                target=entry,
                clean_type=clean_type,
                platform=platform,
                proxy=proxy,
                status="pending",
            )
        )

    # Nothing is written when the file produced no targets.
    if tasks:
        await CleaningTask.bulk_create(tasks)

    return Success(msg=f"Successfully imported {len(tasks)} tasks")
|
||||
|
||||
@router.get("/list", summary="获取清洗任务列表")
async def list_tasks(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
    target: str = Query(None, description="目标搜索"),
    status: str = Query(None, description="状态筛选"),
    clean_type: str = Query(None, description="清洗类型筛选")
):
    """Return a page of cleaning tasks, newest first.

    Args:
        page: Page number.
        page_size: Rows per page.
        target: Optional substring to look for in the task target.
        status: Optional exact status filter.
        clean_type: Optional exact cleaning-type filter.
    """
    # Start from an empty condition and AND-in each filter that was supplied.
    search = Q()
    if target:
        search = search & Q(target__contains=target)
    if status:
        search = search & Q(status=status)
    if clean_type:
        search = search & Q(clean_type=clean_type)

    total, tasks = await cleaning_controller.list(
        page=page,
        page_size=page_size,
        search=search,
        order=["-created_at"],
    )

    data = []
    for task in tasks:
        data.append(await task.to_dict())
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
|
||||
|
||||
@router.post("/process/{task_id}", summary="处理单个任务")
async def process_task(task_id: int):
    """Process one stored cleaning task and persist the outcome on it.

    Args:
        task_id: Primary key of the task to run.

    Returns:
        ``Success`` with the updated task, or a ``Success`` carrying code 404
        when the controller returns no task.

    Raises:
        Exception: Whatever ``cleaning_service.process_single_item`` raises is
            re-raised after the task has been marked as failed.
    """
    task = await cleaning_controller.get(id=task_id)
    if not task:
        return Success(code=404, msg="Task not found")

    task.status = "processing"
    await task.save()

    try:
        result = await cleaning_service.process_single_item(
            target=task.target,
            clean_type=task.clean_type,
            platform=task.platform,
            proxy=task.proxy,
        )
    except Exception as exc:
        # Without this the task would stay in "processing" forever after a
        # crash; record the failure, then let the error propagate as before.
        task.status = "fail"
        task.error_msg = str(exc)
        await task.save()
        raise

    task.status = "success" if result.get("success") else "fail"
    task.storage_status = result.get("storage_status", "unknown")
    task.remote_sent = result.get("remote_sent", False)
    task.result_summary = result.get("data_summary")
    task.error_msg = result.get("error")

    await task.save()

    return Success(data=await task.to_dict(), msg="Task processed")
|
||||
|
||||
@router.delete("/delete", summary="删除任务")
async def delete_task(
    id: int = Query(..., description="任务ID")
):
    """Remove one cleaning task through the cleaning controller.

    Args:
        id: Primary key of the task to remove.
    """
    task_id = id
    await cleaning_controller.remove(id=task_id)
    return Success(msg="Deleted Successfully")
|
||||
|
||||
@router.post("/clear", summary="清空所有任务")
async def clear_tasks():
    """Delete every ``CleaningTask`` row and confirm."""
    every_task = CleaningTask.all()
    await every_task.delete()
    return Success(msg="All tasks cleared")
|
||||
8
app/api/v1/depts/__init__.py
Normal file
8
app/api/v1/depts/__init__.py
Normal file
@ -0,0 +1,8 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .depts import router
|
||||
|
||||
# Package-level router that re-exports the department endpoints under one tag.
depts_router = APIRouter()
depts_router.include_router(router, tags=["部门模块"])

# Only the aggregated router is part of the package's public API.
__all__ = ["depts_router"]
|
||||
48
app/api/v1/depts/depts.py
Normal file
48
app/api/v1/depts/depts.py
Normal file
@ -0,0 +1,48 @@
|
||||
from fastapi import APIRouter, Query
|
||||
|
||||
from app.controllers.dept import dept_controller
|
||||
from app.schemas import Success
|
||||
from app.schemas.depts import *
|
||||
|
||||
# Router that the department endpoints below register themselves on.
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/list", summary="查看部门列表")
async def list_dept(
    name: str = Query(None, description="部门名称"),
):
    """Return the department tree built by the department controller.

    Args:
        name: Optional department name forwarded to ``get_dept_tree``.
    """
    tree = await dept_controller.get_dept_tree(name)
    return Success(data=tree)
|
||||
|
||||
|
||||
@router.get("/get", summary="查看部门")
async def get_dept(
    id: int = Query(..., description="部门ID"),
):
    """Fetch one department and return it as a dict.

    Args:
        id: Primary key of the department.
    """
    dept = await dept_controller.get(id=id)
    return Success(data=await dept.to_dict())
|
||||
|
||||
|
||||
@router.post("/create", summary="创建部门")
async def create_dept(
    dept_in: DeptCreate,
):
    """Create a department from the request body.

    Args:
        dept_in: Creation payload passed to ``dept_controller.create_dept``.
    """
    await dept_controller.create_dept(obj_in=dept_in)
    reply = Success(msg="Created Successfully")
    return reply
|
||||
|
||||
|
||||
@router.post("/update", summary="更新部门")
async def update_dept(
    dept_in: DeptUpdate,
):
    """Update a department from the request body.

    Args:
        dept_in: Update payload passed to ``dept_controller.update_dept``.
    """
    await dept_controller.update_dept(obj_in=dept_in)
    reply = Success(msg="Update Successfully")
    return reply
|
||||
|
||||
|
||||
@router.delete("/delete", summary="删除部门")
async def delete_dept(
    dept_id: int = Query(..., description="部门ID"),
):
    """Delete a department through the department controller.

    Args:
        dept_id: Primary key of the department to delete.
    """
    await dept_controller.delete_dept(dept_id=dept_id)
    reply = Success(msg="Deleted Success")
    return reply
|
||||
46
app/api/v1/ingest/ingest.py
Normal file
46
app/api/v1/ingest/ingest.py
Normal file
@ -0,0 +1,46 @@
|
||||
from typing import Optional, List, Dict, Any
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from pydantic import BaseModel, Field
|
||||
from clickhouse_connect.driver import AsyncClient
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from app.services.ingest_service import IngestService
|
||||
|
||||
|
||||
# Router that the ingest endpoints below register themselves on.
router = APIRouter()
|
||||
|
||||
|
||||
class IngestSingleRequest(BaseModel):
    """Request body for storing a single record."""

    # Platform identifier; required.
    platform: str
    # Kind of record being stored; required.
    data_type: str
    # The record itself as a free-form mapping; required.
    data: Dict[str, Any]
    # Forwarded to the service as its duplicate-check flag; defaults to True.
    check_duplicate: bool = True
|
||||
|
||||
|
||||
class IngestBatchRequest(BaseModel):
    """Request body for storing a list of records in one call."""

    # Platform identifier; required.
    platform: str
    # Kind of record being stored; required.
    data_type: str
    # The records, each a free-form mapping; required.
    data_list: List[Dict[str, Any]]
    # Forwarded to the service as its duplicate-check flag; defaults to True.
    check_duplicate: bool = True
|
||||
|
||||
|
||||
async def get_service() -> IngestService:
    """Build an ``IngestService`` around the shared ClickHouse client."""
    ch_client: AsyncClient = await clickhouse_manager.get_client()
    service = IngestService(ch_client)
    return service
|
||||
|
||||
|
||||
@router.post("/data")
async def ingest_data(req: IngestSingleRequest, service: IngestService = Depends(get_service)):
    """Store one record via the ingest service.

    Args:
        req: Single-record request body.
        service: Ingest service resolved by the ``get_service`` dependency.

    Returns:
        A dict with ``code`` 200, the service result and message ``"ok"``.

    Raises:
        HTTPException: 500 with the error text when the service call fails.
    """
    try:
        stored = await service.store_single(
            req.platform, req.data_type, req.data, req.check_duplicate
        )
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
    return {"code": 200, "data": stored, "message": "ok"}
|
||||
|
||||
|
||||
@router.post("/batch")
async def ingest_batch(req: IngestBatchRequest, service: IngestService = Depends(get_service)):
    """Store a list of records via the ingest service.

    Args:
        req: Batch request body.
        service: Ingest service resolved by the ``get_service`` dependency.

    Returns:
        A dict with ``code`` 200, the service result and message ``"ok"``.

    Raises:
        HTTPException: 500 with the error text when the service call fails.
    """
    try:
        stored = await service.store_batch(
            req.platform, req.data_type, req.data_list, req.check_duplicate
        )
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
    return {"code": 200, "data": stored, "message": "ok"}
|
||||
8
app/api/v1/job/__init__.py
Normal file
8
app/api/v1/job/__init__.py
Normal file
@ -0,0 +1,8 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .job import router
|
||||
|
||||
# Package-level router that re-exports the job data endpoints under one tag.
job_router = APIRouter()
job_router.include_router(router, tags=["数据上报"])

# Only the aggregated router is part of the package's public API.
__all__ = ["job_router"]
|
||||
252
app/api/v1/job/job.py
Normal file
252
app/api/v1/job/job.py
Normal file
@ -0,0 +1,252 @@
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends
|
||||
from typing import Dict, Any
|
||||
|
||||
from app.services.job import create_data_router_service, PlatformType, DataType
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from app.controllers.job import (
|
||||
UniversalDataController,
|
||||
UniversalDataRequest,
|
||||
BatchDataRequest,
|
||||
create_universal_data_controller
|
||||
)
|
||||
|
||||
|
||||
# Router that the generic data endpoints below register themselves on.
router = APIRouter(tags=["通用数据接口"])
|
||||
|
||||
|
||||
async def get_universal_data_controller() -> UniversalDataController:
    """Build the universal data controller used as an endpoint dependency.

    Fetches the shared ClickHouse client, wraps it in a data router service
    and hands that service to the controller factory.
    """
    ch_client = await clickhouse_manager.get_client()
    router_service = create_data_router_service(ch_client)
    controller = create_universal_data_controller(router_service)
    return controller
|
||||
|
||||
|
||||
@router.post("/data/store", summary="存储单条数据")
async def store_single_data(
    request: UniversalDataRequest,
    controller: UniversalDataController = Depends(get_universal_data_controller)
) -> Dict[str, Any]:
    """
    Generic data storage endpoint - store a single record.

    Supported platforms:
    - boss: Boss Zhipin
    - qcwy: 51job (Qianchengwuyou)
    - zhilian: Zhilian Zhaopin

    Supported data types:
    - job: job posting data
    - company: company data

    Example request:
    ```json
    {
        "data": {
            "jobBaseInfoVO": {
                "encryptJobId": "abc123",
                "positionName": "Python Developer",
                "locationName": "Beijing"
            },
            "brandComInfoVO": {
                "brandName": "Example Tech Co.",
                "industryName": "Internet"
            }
        },
        "data_type": "job",
        "platform": "boss",
        "check_duplicate": true,
        "duplicate_key": "encrypt_job_id"
    }
    ```
    """
    # The controller does all the work; this endpoint only forwards the body.
    stored = await controller.store_single_data(request)
    return stored
|
||||
|
||||
|
||||
@router.post("/data/batch-store", summary="批量存储数据")
async def store_batch_data(
    request: BatchDataRequest,
    controller: UniversalDataController = Depends(get_universal_data_controller)
) -> Dict[str, Any]:
    """
    Generic data storage endpoint - store a batch of records.

    Example request:
    ```json
    {
        "data_list": [
            {
                "jobBaseInfoVO": {
                    "encryptJobId": "abc123",
                    "positionName": "Python Developer"
                }
            },
            {
                "jobBaseInfoVO": {
                    "encryptJobId": "def456",
                    "positionName": "Java Developer"
                }
            }
        ],
        "data_type": "job",
        "platform": "boss",
        "check_duplicate": true
    }
    ```
    """
    # The controller does all the work; this endpoint only forwards the body.
    stored = await controller.store_batch_data(request)
    return stored
|
||||
|
||||
|
||||
@router.post("/data/store-async", summary="异步存储单条数据")
async def store_single_data_async(
    request: UniversalDataRequest,
    background_tasks: BackgroundTasks,
    controller: UniversalDataController = Depends(get_universal_data_controller)
) -> Dict[str, Any]:
    """
    Generic data storage endpoint - store a single record asynchronously.

    Intended for large volumes of data or callers that do not need the
    result immediately.
    """
    # Hand the request plus FastAPI's background task queue to the controller.
    accepted = await controller.store_single_data_async(background_tasks, request)
    return accepted
|
||||
|
||||
|
||||
@router.post("/data/batch-store-async", summary="异步批量存储数据")
async def store_batch_data_async(
    request: BatchDataRequest,
    background_tasks: BackgroundTasks,
    controller: UniversalDataController = Depends(get_universal_data_controller)
) -> Dict[str, Any]:
    """
    Generic data storage endpoint - store a batch of records asynchronously.

    Intended for large-batch processing.
    """
    # Hand the request plus FastAPI's background task queue to the controller.
    accepted = await controller.store_batch_data_async(background_tasks, request)
    return accepted
|
||||
|
||||
|
||||
@router.get("/data", summary="查询数据")
async def query_data(
    platform: str,
    data_type: str,
    page: int = 1,
    page_size: int = 20,
    controller: UniversalDataController = Depends(get_universal_data_controller)
) -> Dict[str, Any]:
    """
    Generic data query endpoint.

    Parameters:
    - platform: platform type (boss/qcwy/zhilian)
    - data_type: data type (job/company)
    - page: page number, default 1
    - page_size: page size, default 20

    Responds with HTTP 400 when platform or data_type is not a supported value.
    """
    # Local import: this module's top-level fastapi import does not include
    # HTTPException.
    from fastapi import HTTPException

    # Convert the string parameters to their enum types. An unknown value
    # raises ValueError, which used to surface as an HTTP 500; report it as a
    # client error instead.
    try:
        platform_enum = PlatformType(platform)
        data_type_enum = DataType(data_type)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    return await controller.query_data(platform_enum, data_type_enum, page, page_size)
|
||||
|
||||
@router.get("/platforms", summary="获取支持的平台和数据类型")
async def get_supported_platforms(
    controller: UniversalDataController = Depends(get_universal_data_controller)
) -> Dict[str, Any]:
    """
    Return information about the supported platforms and data types.

    Per the original description, the response contains:
    - the list of supported platforms
    - the list of supported data types
    - the default duplicate-check field for each platform
    """
    supported = await controller.get_supported_platforms()
    return supported
|
||||
|
||||
|
||||
# 为了兼容性,提供平台特定的路由别名
|
||||
@router.post("/boss/job", summary="Boss直聘职位数据存储")
async def store_boss_job_data(
    data: Dict[str, Any],
    controller: UniversalDataController = Depends(get_universal_data_controller)
) -> Dict[str, Any]:
    """Convenience endpoint: store one Boss Zhipin job record.

    Wraps the raw body in a ``UniversalDataRequest`` with platform ``boss``,
    data type ``job`` and duplicate checking enabled.
    """
    return await controller.store_single_data(
        UniversalDataRequest(
            platform="boss",
            data_type="job",
            data=data,
            check_duplicate=True,
        )
    )
|
||||
|
||||
|
||||
@router.post("/boss/company", summary="Boss直聘公司数据存储")
async def store_boss_company_data(
    data: Dict[str, Any],
    controller: UniversalDataController = Depends(get_universal_data_controller)
) -> Dict[str, Any]:
    """Convenience endpoint: store one Boss Zhipin company record.

    Wraps the raw body in a ``UniversalDataRequest`` with platform ``boss``,
    data type ``company`` and duplicate checking enabled.
    """
    return await controller.store_single_data(
        UniversalDataRequest(
            platform="boss",
            data_type="company",
            data=data,
            check_duplicate=True,
        )
    )
|
||||
|
||||
|
||||
@router.post("/qcwy/job", summary="前程无忧职位数据存储")
async def store_qcwy_job_data(
    data: Dict[str, Any],
    controller: UniversalDataController = Depends(get_universal_data_controller)
) -> Dict[str, Any]:
    """Convenience endpoint: store one 51job (qcwy) job record.

    Wraps the raw body in a ``UniversalDataRequest`` with platform ``qcwy``,
    data type ``job`` and duplicate checking enabled.
    """
    return await controller.store_single_data(
        UniversalDataRequest(
            platform="qcwy",
            data_type="job",
            data=data,
            check_duplicate=True,
        )
    )
|
||||
|
||||
|
||||
@router.post("/qcwy/company", summary="前程无忧公司数据存储")
async def store_qcwy_company_data(
    data: Dict[str, Any],
    controller: UniversalDataController = Depends(get_universal_data_controller)
) -> Dict[str, Any]:
    """Convenience endpoint: store one 51job (qcwy) company record.

    Wraps the raw body in a ``UniversalDataRequest`` with platform ``qcwy``,
    data type ``company`` and duplicate checking enabled.
    """
    return await controller.store_single_data(
        UniversalDataRequest(
            platform="qcwy",
            data_type="company",
            data=data,
            check_duplicate=True,
        )
    )
|
||||
|
||||
|
||||
@router.post("/zhilian/job", summary="智联招聘职位数据存储")
async def store_zhilian_job_data(
    data: Dict[str, Any],
    controller: UniversalDataController = Depends(get_universal_data_controller)
) -> Dict[str, Any]:
    """Convenience endpoint: store one Zhilian Zhaopin job record.

    Wraps the raw body in a ``UniversalDataRequest`` with platform
    ``zhilian``, data type ``job`` and duplicate checking enabled.
    """
    return await controller.store_single_data(
        UniversalDataRequest(
            platform="zhilian",
            data_type="job",
            data=data,
            check_duplicate=True,
        )
    )
|
||||
|
||||
|
||||
@router.post("/zhilian/company", summary="智联招聘公司数据存储")
async def store_zhilian_company_data(
    data: Dict[str, Any],
    controller: UniversalDataController = Depends(get_universal_data_controller)
) -> Dict[str, Any]:
    """Convenience endpoint: store one Zhilian Zhaopin company record.

    Wraps the raw body in a ``UniversalDataRequest`` with platform
    ``zhilian``, data type ``company`` and duplicate checking enabled.
    """
    return await controller.store_single_data(
        UniversalDataRequest(
            platform="zhilian",
            data_type="company",
            data=data,
            check_duplicate=True,
        )
    )
|
||||
9
app/api/v1/keyword/__init__.py
Normal file
9
app/api/v1/keyword/__init__.py
Normal file
@ -0,0 +1,9 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .keyword import router
|
||||
|
||||
# Package-level router that re-exports the keyword endpoints under one tag.
keyword_router = APIRouter()
keyword_router.include_router(router, tags=["关键词接口"])

# Only the aggregated router is part of the package's public API.
__all__ = ["keyword_router"]
|
||||
|
||||
182
app/api/v1/keyword/keyword.py
Normal file
182
app/api/v1/keyword/keyword.py
Normal file
@ -0,0 +1,182 @@
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from app.controllers.keyword import KeywordController
|
||||
from app.core.dependency import DependPermission
|
||||
from app.schemas.keyword import KeywordCreate, KeywordUpdate
|
||||
|
||||
# Router that the keyword endpoints below register themselves on.
router = APIRouter(tags=["关键词接口"])
|
||||
|
||||
|
||||
class MarkUsedRequest(BaseModel):
    """Request body for marking keyword records as used."""

    # Platform identifier; required, restricted to boss / qcwy / zhilian.
    source: str = Field(..., pattern="^(boss|qcwy|zhilian)$")
    # Ids of the records to mark.
    ids: List[int]
|
||||
|
||||
|
||||
class StatsQuery(BaseModel):
    """Parameters of a per-platform statistics query."""

    # Platform identifier; required, restricted to boss / qcwy / zhilian.
    source: str = Field(..., pattern="^(boss|qcwy|zhilian)$")
    # Optional date string; defaults to None.
    date: str | None = None
|
||||
|
||||
|
||||
async def get_keyword_controller() -> KeywordController:
    """Create the keyword controller used as an endpoint dependency.

    Returns:
        A new ``KeywordController`` instance.
    """
    controller = KeywordController()
    return controller
|
||||
|
||||
|
||||
@router.get("/available", summary="获取当天未使用的检索条件")
async def get_available(
    source: str,
    limit: int = 1,
    reserve: bool = True,
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Fetch search conditions not yet used today for a platform.

    Args:
        source: Platform identifier, boss|qcwy|zhilian.
        limit: Maximum number of items to return, default 1.
        reserve: Flag forwarded unchanged to the controller.
        controller: Keyword controller resolved by dependency injection.

    Returns:
        The controller's result; per the original description a dict with
        items/total/limit.
    """
    available = await controller.get_available(source, limit, reserve)
    return available
|
||||
|
||||
|
||||
@router.post("/mark-used", summary="将检索条件标记为今日已使用")
async def mark_used(
    request: MarkUsedRequest,
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Mark the given search conditions as used today.

    Args:
        request: Body holding the platform identifier and the record ids.
        controller: Keyword controller resolved by dependency injection.

    Returns:
        The controller's result; per the original description it contains
        the number of updated rows and the date.
    """
    platform, record_ids = request.source, request.ids
    return await controller.mark_used(platform, record_ids)
|
||||
|
||||
|
||||
@router.get("/stats", summary="统计使用与未使用数量")
async def get_stats(
    source: str,
    date: str | None = None,
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Count used and unused search conditions for a platform on one date.

    Args:
        source: Platform identifier, boss|qcwy|zhilian.
        date: Date as YYYY-MM-DD. When omitted or unparseable, ``None`` is
            passed to the controller (described originally as "today").
        controller: Keyword controller resolved by dependency injection.

    Returns:
        The controller's result; per the original description a dict with
        total/used/unused.
    """
    # Imported under an alias because the ``date`` parameter shadows the name.
    from datetime import date as _date

    parsed = None
    if date:
        try:
            year, month, day = (int(piece) for piece in date.split("-"))
            parsed = _date(year, month, day)
        except Exception:
            # Any malformed date silently falls back to None.
            parsed = None
    return await controller.get_stats(source, parsed)
|
||||
|
||||
|
||||
@router.get("/overview", summary="获取所有平台统计概览", dependencies=[DependPermission])
async def get_overview(
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Return the statistics overview across all platforms.

    Returns:
        The result of ``controller.get_overview_stats()``.
    """
    overview = await controller.get_overview_stats()
    return overview
|
||||
|
||||
|
||||
@router.get("/list", summary="获取关键词列表", dependencies=[DependPermission])
async def list_keywords(
    source: str = Query(..., pattern="^(boss|qcwy|zhilian)$"),
    page: int = 1,
    page_size: int = 20,
    city: str | None = None,
    job: str | None = None,
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Return a page of keywords for one platform.

    Args:
        source: Platform identifier.
        page: Page number.
        page_size: Rows per page.
        city: Optional city filter.
        job: Optional job filter.
        controller: Keyword controller resolved by dependency injection.

    Returns:
        The list data produced by the controller.
    """
    listing = await controller.list_keywords(source, page, page_size, city, job)
    return listing
|
||||
|
||||
|
||||
@router.post("/create", summary="创建关键词", dependencies=[DependPermission])
async def create_keyword(
    item: KeywordCreate,
    source: str = Query(..., pattern="^(boss|qcwy|zhilian)$"),
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Create a keyword for one platform.

    Args:
        item: Keyword data.
        source: Platform identifier.
        controller: Keyword controller resolved by dependency injection.

    Returns:
        The creation result produced by the controller.
    """
    created = await controller.create_keyword(source, item)
    return created
|
||||
|
||||
|
||||
@router.put("/update", summary="更新关键词", dependencies=[DependPermission])
async def update_keyword(
    id: int,
    item: KeywordUpdate,
    source: str = Query(..., pattern="^(boss|qcwy|zhilian)$"),
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Update one keyword record of a platform.

    Args:
        id: Record id.
        item: Update data.
        source: Platform identifier.
        controller: Keyword controller resolved by dependency injection.

    Returns:
        The update result produced by the controller.
    """
    updated = await controller.update_keyword(source, id, item)
    return updated
|
||||
|
||||
|
||||
@router.delete("/delete", summary="删除关键词", dependencies=[DependPermission])
async def delete_keyword(
    id: int,
    source: str = Query(..., pattern="^(boss|qcwy|zhilian)$"),
    controller: KeywordController = Depends(get_keyword_controller),
) -> Dict[str, Any]:
    """Delete one keyword record of a platform.

    Args:
        id: Record id.
        source: Platform identifier.
        controller: Keyword controller resolved by dependency injection.

    Returns:
        The deletion result produced by the controller.
    """
    deleted = await controller.delete_keyword(source, id)
    return deleted
|
||||
8
app/api/v1/menus/__init__.py
Normal file
8
app/api/v1/menus/__init__.py
Normal file
@ -0,0 +1,8 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .menus import router
|
||||
|
||||
# Package-level router that re-exports the menu endpoints under one tag.
menus_router = APIRouter()
menus_router.include_router(router, tags=["菜单模块"])

# Only the aggregated router is part of the package's public API.
__all__ = ["menus_router"]
|
||||
63
app/api/v1/menus/menus.py
Normal file
63
app/api/v1/menus/menus.py
Normal file
@ -0,0 +1,63 @@
|
||||
import logging
|
||||
|
||||
from fastapi import APIRouter, Query
|
||||
|
||||
from app.controllers.menu import menu_controller
|
||||
from app.schemas.base import Fail, Success, SuccessExtra
|
||||
from app.schemas.menus import *
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
# Router that the menu endpoints below register themselves on.
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/list", summary="查看菜单列表")
async def list_menu(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
):
    """Return the whole menu tree, starting from menus with ``parent_id=0``.

    ``page`` and ``page_size`` are only echoed back in the response; the tree
    itself is not sliced here.

    Args:
        page: Page number echoed in the response.
        page_size: Page size echoed in the response.
    """

    async def get_menu_with_children(menu_id: int):
        # Load one menu, then recurse into its children ordered by "order".
        node = await menu_controller.model.get(id=menu_id)
        node_dict = await node.to_dict()
        children = await menu_controller.model.filter(parent_id=menu_id).order_by("order")
        subtree = []
        for child in children:
            subtree.append(await get_menu_with_children(child.id))
        node_dict["children"] = subtree
        return node_dict

    roots = await menu_controller.model.filter(parent_id=0).order_by("order")
    res_menu = []
    for root in roots:
        res_menu.append(await get_menu_with_children(root.id))
    return SuccessExtra(data=res_menu, total=len(res_menu), page=page, page_size=page_size)
|
||||
|
||||
|
||||
@router.get("/get", summary="查看菜单")
async def get_menu(
    menu_id: int = Query(..., description="菜单id"),
):
    """Fetch one menu through the menu controller.

    Args:
        menu_id: Primary key of the menu.
    """
    menu = await menu_controller.get(id=menu_id)
    return Success(data=menu)
|
||||
|
||||
|
||||
@router.post("/create", summary="创建菜单")
async def create_menu(
    menu_in: MenuCreate,
):
    """Create a menu from the request body.

    Args:
        menu_in: Creation payload passed to ``menu_controller.create``.
    """
    await menu_controller.create(obj_in=menu_in)
    reply = Success(msg="Created Success")
    return reply
|
||||
|
||||
|
||||
@router.post("/update", summary="更新菜单")
async def update_menu(
    menu_in: MenuUpdate,
):
    """Update the menu identified by ``menu_in.id``.

    Args:
        menu_in: Update payload; its ``id`` selects the row to update.
    """
    target_id = menu_in.id
    await menu_controller.update(id=target_id, obj_in=menu_in)
    return Success(msg="Updated Success")
|
||||
|
||||
|
||||
@router.delete("/delete", summary="删除菜单")
async def delete_menu(
    id: int = Query(..., description="菜单id"),
):
    """Delete a menu unless it still has child menus.

    Args:
        id: Primary key of the menu to delete.

    Returns:
        ``Fail`` when at least one menu has this id as ``parent_id``,
        otherwise ``Success`` after removal.
    """
    children = await menu_controller.model.filter(parent_id=id).count()
    # Refuse to orphan child menus.
    if children > 0:
        return Fail(msg="Cannot delete a menu with child menus")

    await menu_controller.remove(id=id)
    return Success(msg="Deleted Success")
|
||||
18
app/api/v1/pipeline.py
Normal file
18
app/api/v1/pipeline.py
Normal file
@ -0,0 +1,18 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from app.core.scheduler import ecs_full_pipeline_job
|
||||
|
||||
|
||||
# Router carrying the ECS pipeline trigger endpoint below.
pipeline_router = APIRouter(tags=["ECS任务"])
|
||||
|
||||
|
||||
@pipeline_router.get("/trigger", summary="立即触发ECS全流程任务(无需鉴权)")
async def trigger_ecs_pipeline():
    """
    Run the ECS full-pipeline job once on demand.

    Per the original description the job performs delete -> create ->
    install cloud assistant -> run command; that logic lives in
    ``ecs_full_pipeline_job``, which is awaited to completion here.

    Parameters: none.
    Returns: a status dict.
    Purpose: a manual trigger entry point for a console or external system.
    """
    await ecs_full_pipeline_job()
    reply = {"code": 200, "message": "ECS 全流程任务已触发执行"}
    return reply
|
||||
10
app/api/v1/proxy/__init__.py
Normal file
10
app/api/v1/proxy/__init__.py
Normal file
@ -0,0 +1,10 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .proxy import proxy_router as proxy_sub_router
|
||||
|
||||
|
||||
# Package-level router that re-exports the proxy endpoints under one tag.
proxy_router = APIRouter()
proxy_router.include_router(proxy_sub_router, tags=["代理IP管理"])

# Only the aggregated router is part of the package's public API.
__all__ = ["proxy_router"]
|
||||
|
||||
174
app/api/v1/proxy/proxy.py
Normal file
174
app/api/v1/proxy/proxy.py
Normal file
@ -0,0 +1,174 @@
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
from fastapi import APIRouter, Query, HTTPException
|
||||
from tortoise.transactions import in_transaction
|
||||
|
||||
from app.models.cleaning import ProxyConfig, ProxyProvider
|
||||
from app.schemas.base import Success, SuccessExtra
|
||||
|
||||
|
||||
# Router that the proxy config/provider endpoints below register themselves on.
proxy_router = APIRouter()
|
||||
|
||||
|
||||
@proxy_router.get("/configs")
async def list_proxy_configs(
    name: Optional[str] = Query(None, description="名称"),
    platform: Optional[str] = Query(None, description="平台标识"),
    proxy_type: Optional[str] = Query(None, description="代理类型: http/socks/tunnel"),
    is_active: Optional[bool] = Query(None, description="是否启用"),
    page: int = Query(1, ge=1, description="页码"),
    page_size: int = Query(10, ge=1, le=200, description="每页数量"),
):
    """Return a page of proxy configs, highest id first.

    Args:
        name: Optional case-insensitive substring filter on the name.
        platform: Optional exact platform filter.
        proxy_type: Optional exact proxy-type filter.
        is_active: Optional enabled/disabled filter; ``None`` means no filter.
        page: 1-based page number.
        page_size: Rows per page (1-200).
    """
    queryset = ProxyConfig.all()
    if name:
        queryset = queryset.filter(name__icontains=name)
    if platform:
        queryset = queryset.filter(platform=platform)
    if proxy_type:
        queryset = queryset.filter(proxy_type=proxy_type)
    # Compared against None so that an explicit False still filters.
    if is_active is not None:
        queryset = queryset.filter(is_active=is_active)

    total = await queryset.count()
    skip = (page - 1) * page_size
    rows = await queryset.order_by("-id").offset(skip).limit(page_size)

    data = []
    for row in rows:
        data.append(
            {
                "id": row.id,
                "name": row.name,
                "proxy_type": row.proxy_type,
                "platform": row.platform,
                "proxy_url": row.proxy_url,
                "is_active": row.is_active,
                "created_at": row.created_at,
                "updated_at": row.updated_at,
            }
        )
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
|
||||
|
||||
|
||||
@proxy_router.post("/configs")
async def create_proxy_config(payload: Dict[str, Any]):
    """Create a proxy config from a free-form JSON body.

    Args:
        payload: Body dict. ``platform`` defaults to ``"all"`` and
            ``is_active`` to ``True``; other fields default to ``None``.

    Returns:
        ``Success`` carrying the new row's id.

    Raises:
        HTTPException: 400 with the error text when creation fails.
    """
    fields = {
        "name": payload.get("name"),
        "proxy_type": payload.get("proxy_type"),
        "platform": payload.get("platform", "all"),
        "proxy_url": payload.get("proxy_url"),
        "is_active": bool(payload.get("is_active", True)),
    }
    try:
        async with in_transaction():
            created = await ProxyConfig.create(**fields)
        return Success(data={"id": created.id})
    except Exception as exc:
        raise HTTPException(status_code=400, detail=str(exc))
|
||||
|
||||
|
||||
@proxy_router.put("/configs/{config_id}")
async def update_proxy_config(config_id: int, payload: Dict[str, Any]):
    """Update selected fields of one proxy config.

    Args:
        config_id: Primary key of the config to update.
        payload: Body dict; only the known field names present in it are
            applied, everything else is ignored.

    Raises:
        HTTPException: 404 when no config has this id.
    """
    config = await ProxyConfig.get_or_none(id=config_id)
    if not config:
        raise HTTPException(status_code=404, detail="Proxy config not found")

    updatable = ("name", "proxy_type", "platform", "proxy_url", "is_active")
    for key in updatable:
        if key in payload:
            setattr(config, key, payload[key])

    await config.save()
    return Success(data={"id": config.id})
|
||||
|
||||
|
||||
@proxy_router.delete("/configs/{config_id}")
async def delete_proxy_config(config_id: int):
    """Delete one proxy config.

    Args:
        config_id: Primary key of the config to delete.

    Raises:
        HTTPException: 404 when no config has this id.
    """
    config = await ProxyConfig.get_or_none(id=config_id)
    if not config:
        raise HTTPException(status_code=404, detail="Proxy config not found")

    await config.delete()
    return Success(data={"id": config_id})
|
||||
|
||||
|
||||
@proxy_router.get("/providers")
async def list_proxy_providers(
    name: Optional[str] = Query(None, description="名称"),
    platform: Optional[str] = Query(None, description="平台标识"),
    mode: Optional[str] = Query(None, description="解析模式"),
    page: int = Query(1, ge=1, description="页码"),
    page_size: int = Query(10, ge=1, le=200, description="每页数量"),
):
    """Return a page of proxy providers, highest id first.

    Args:
        name: Optional case-insensitive substring filter on the name.
        platform: Optional exact platform filter.
        mode: Optional exact parse-mode filter.
        page: 1-based page number.
        page_size: Rows per page (1-200).
    """
    queryset = ProxyProvider.all()
    if name:
        queryset = queryset.filter(name__icontains=name)
    if platform:
        queryset = queryset.filter(platform=platform)
    if mode:
        queryset = queryset.filter(mode=mode)

    total = await queryset.count()
    skip = (page - 1) * page_size
    rows = await queryset.order_by("-id").offset(skip).limit(page_size)

    data = []
    for row in rows:
        data.append(
            {
                "id": row.id,
                "name": row.name,
                "platform": row.platform,
                "mode": row.mode,
                "list_path": row.list_path,
                "ip_path": row.ip_path,
                "port_path": row.port_path,
                "username_path": row.username_path,
                "password_path": row.password_path,
                "pattern": row.pattern,
                "template": row.template,
                "created_at": row.created_at,
                "updated_at": row.updated_at,
            }
        )
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
|
||||
|
||||
|
||||
@proxy_router.post("/providers")
async def create_proxy_provider(payload: Dict[str, Any]):
    """Create a proxy provider row from a raw JSON payload.

    ``platform`` defaults to ``"all"`` and ``mode`` to ``"json"``; every other
    field is taken from ``payload`` as-is (``None`` when absent). The insert
    runs inside a transaction.

    Returns:
        ``Success`` whose data holds the new row's ``id``.

    Raises:
        HTTPException: status 400 carrying the error text when any step of
            the creation raises.
    """
    passthrough = (
        "list_path",
        "ip_path",
        "port_path",
        "username_path",
        "password_path",
        "pattern",
        "template",
    )
    try:
        async with in_transaction():
            values = {
                "name": payload.get("name"),
                "platform": payload.get("platform", "all"),
                "mode": payload.get("mode", "json"),
            }
            for key in passthrough:
                values[key] = payload.get(key)
            created = await ProxyProvider.create(**values)
        return Success(data={"id": created.id})
    except Exception as err:
        raise HTTPException(status_code=400, detail=str(err))
|
||||
|
||||
|
||||
@proxy_router.put("/providers/{provider_id}")
async def update_proxy_provider(provider_id: int, payload: Dict[str, Any]):
    """Partially update a proxy provider.

    Only the keys of ``payload`` that appear in the editable field list are
    copied onto the row.

    Returns:
        ``Success`` whose data holds the row's ``id``.

    Raises:
        HTTPException: status 404 when no row has ``provider_id``.
    """
    provider = await ProxyProvider.get_or_none(id=provider_id)
    if not provider:
        raise HTTPException(status_code=404, detail="Proxy provider not found")

    editable = (
        "name",
        "platform",
        "mode",
        "list_path",
        "ip_path",
        "port_path",
        "username_path",
        "password_path",
        "pattern",
        "template",
    )
    for attr in editable:
        if attr not in payload:
            continue
        setattr(provider, attr, payload[attr])

    await provider.save()
    return Success(data={"id": provider.id})
|
||||
|
||||
|
||||
@proxy_router.delete("/providers/{provider_id}")
async def delete_proxy_provider(provider_id: int):
    """Delete the proxy provider identified by ``provider_id``.

    Returns:
        ``Success`` whose data echoes the deleted ``id``.

    Raises:
        HTTPException: status 404 when no row has ``provider_id``.
    """
    target = await ProxyProvider.get_or_none(id=provider_id)
    if target:
        await target.delete()
        return Success(data={"id": provider_id})
    raise HTTPException(status_code=404, detail="Proxy provider not found")
|
||||
8
app/api/v1/roles/__init__.py
Normal file
8
app/api/v1/roles/__init__.py
Normal file
@ -0,0 +1,8 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .roles import router
|
||||
|
||||
# Package-level router: mounts the handlers from ``roles.py`` under one tag
# and is the only name this package exports.
roles_router = APIRouter()
roles_router.include_router(router, tags=["角色模块"])

__all__ = ["roles_router"]
|
||||
73
app/api/v1/roles/roles.py
Normal file
73
app/api/v1/roles/roles.py
Normal file
@ -0,0 +1,73 @@
|
||||
import logging
|
||||
|
||||
from fastapi import APIRouter, Query
|
||||
from fastapi.exceptions import HTTPException
|
||||
from tortoise.expressions import Q
|
||||
|
||||
from app.controllers import role_controller
|
||||
from app.schemas.base import Success, SuccessExtra
|
||||
from app.schemas.roles import *
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# Router that the role endpoints below register themselves on.
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/list", summary="查看角色列表")
async def list_role(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
    role_name: str = Query("", description="角色名称,用于查询"),
):
    """Return one page of roles, optionally filtered by a name substring.

    Returns:
        ``SuccessExtra`` with the serialized roles plus ``total``, ``page``
        and ``page_size``.
    """
    # An empty Q matches every role; a non-empty name narrows the search.
    criteria = Q(name__contains=role_name) if role_name else Q()
    total, roles = await role_controller.list(page=page, page_size=page_size, search=criteria)
    data = []
    for role in roles:
        data.append(await role.to_dict())
    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
|
||||
|
||||
|
||||
@router.get("/get", summary="查看角色")
async def get_role(
    role_id: int = Query(..., description="角色ID"),
):
    """Fetch one role through ``role_controller`` and return it as a dict."""
    role = await role_controller.get(id=role_id)
    payload = await role.to_dict()
    return Success(data=payload)
|
||||
|
||||
|
||||
@router.post("/create", summary="创建角色")
async def create_role(role_in: RoleCreate):
    """Create a role unless one with the same name already exists.

    Raises:
        HTTPException: status 400 when ``role_controller.is_exist`` reports a
            role with ``role_in.name``.
    """
    name_taken = await role_controller.is_exist(name=role_in.name)
    if not name_taken:
        await role_controller.create(obj_in=role_in)
        return Success(msg="Created Successfully")
    raise HTTPException(
        status_code=400,
        detail="The role with this rolename already exists in the system.",
    )
|
||||
|
||||
|
||||
@router.post("/update", summary="更新角色")
async def update_role(role_in: RoleUpdate):
    """Apply ``role_in`` to the role whose id it carries."""
    target_id = role_in.id
    await role_controller.update(id=target_id, obj_in=role_in)
    return Success(msg="Updated Successfully")
|
||||
|
||||
|
||||
@router.delete("/delete", summary="删除角色")
async def delete_role(
    role_id: int = Query(..., description="角色ID"),
):
    """Remove the role with the given id through ``role_controller``."""
    await role_controller.remove(id=role_id)
    response = Success(msg="Deleted Success")
    return response
|
||||
|
||||
|
||||
@router.get("/authorized", summary="查看角色权限")
async def get_role_authorized(id: int = Query(..., description="角色ID")):
    """Return a role serialized with ``to_dict(m2m=True)``."""
    role = await role_controller.get(id=id)
    return Success(data=await role.to_dict(m2m=True))
|
||||
|
||||
|
||||
@router.post("/authorized", summary="更新角色权限")
async def update_role_authorized(role_in: RoleUpdateMenusApis):
    """Pass a role's ``menu_ids`` and ``api_infos`` to ``role_controller.update_roles``."""
    role = await role_controller.get(id=role_in.id)
    await role_controller.update_roles(
        role=role,
        menu_ids=role_in.menu_ids,
        api_infos=role_in.api_infos,
    )
    return Success(msg="Updated Successfully")
|
||||
13
app/api/v1/stats.py
Normal file
13
app/api/v1/stats.py
Normal file
@ -0,0 +1,13 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from app.core.scheduler import stats_job
|
||||
|
||||
|
||||
# Router for the statistics endpoints defined in this module.
stats_router = APIRouter(tags=["统计任务"])
|
||||
|
||||
|
||||
@stats_router.get("/trigger", summary="触发统计并上报(无需鉴权)")
async def trigger_stats():
    """Run the scheduler's ``stats_job`` once and acknowledge.

    The original note says the job also reports its result and sends an email
    notification; that happens inside ``stats_job`` and is not visible here.
    """
    await stats_job()
    acknowledgement = {"code": 200, "message": "统计任务已执行并尝试上报"}
    return acknowledgement
|
||||
9
app/api/v1/token/__init__.py
Normal file
9
app/api/v1/token/__init__.py
Normal file
@ -0,0 +1,9 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .token import token_router as token_sub_router
|
||||
|
||||
# Package-level router: mounts the handlers from ``token.py`` under one tag
# and is the only name this package exports.
token_router = APIRouter()
token_router.include_router(token_sub_router, tags=["Token管理"])

__all__ = ["token_router"]
|
||||
|
||||
187
app/api/v1/token/token.py
Normal file
187
app/api/v1/token/token.py
Normal file
@ -0,0 +1,187 @@
|
||||
import logging
|
||||
import time
|
||||
from typing import Any, Dict, Tuple
|
||||
from fastapi import APIRouter, Query, Body, Path, BackgroundTasks
|
||||
from fastapi.background import P
|
||||
from tortoise.expressions import Q
|
||||
|
||||
from app.controllers.token import token_controller
|
||||
from app.schemas.base import Fail, Success, SuccessExtra
|
||||
from app.schemas.token import BossTokenUpdate,BossTokenCreate
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Router that the Boss-token endpoints below register themselves on.
token_router = APIRouter()

# Simple in-memory cache: the key is the tuple of query parameters and the
# value is (timestamp when cached, response data).
_TOKENS_CACHE: Dict[Tuple[Any, Any, int, int], Tuple[float, Dict[str, Any]]] = {}
# Lifetime of a cache entry, in seconds.
_CACHE_TTL_SECONDS: int =60
|
||||
|
||||
|
||||
@token_router.get("/tokens", summary="获取Boss Token列表")
async def list_boss_tokens(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
    status: int = Query(None, description="状态筛选"),
):
    """Return one page of Boss tokens, optionally filtered by ``status``.

    Returns:
        ``SuccessExtra`` with the serialized tokens plus ``total``, ``page``
        and ``page_size``.
    """
    criteria = Q()
    if status is not None:
        criteria &= Q(status=status)

    total, tokens = await token_controller.get_tokens(page=page, page_size=page_size, search=criteria)
    data = []
    for token in tokens:
        data.append(await token.to_dict())

    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
|
||||
|
||||
|
||||
@token_router.get("/tokens/{token_id}", summary="获取Boss Token详情")
async def get_boss_token(
    token_id: int = Path(..., description="Token ID"),
):
    """Fetch one Boss token through ``token_controller`` and return it as a dict."""
    token = await token_controller.get_token(token_id)
    return Success(data=await token.to_dict())
|
||||
|
||||
|
||||
@token_router.post("/tokens", summary="创建Boss Token")
async def create_boss_token(
    token_data: BossTokenCreate = Body(..., description="Token数据"),
):
    """Create a Boss token, then empty the token-list cache."""
    await token_controller.create_token(token_data)
    # Empty the cache so the new token shows up in list responses right away.
    _TOKENS_CACHE.clear()
    response = Success(msg="创建成功")
    return response
|
||||
|
||||
|
||||
@token_router.put("/tokens/{token_id}", summary="更新Boss Token")
async def update_boss_token(
    token_id: int = Path(..., description="Token ID"),
    token_data: BossTokenUpdate = Body(..., description="Token数据"),
):
    """Update a Boss token, then empty the token-list cache."""
    await token_controller.update_token(token_id, token_data)
    # Empty the cache so the change shows up in list responses right away.
    _TOKENS_CACHE.clear()
    response = Success(msg="更新成功")
    return response
|
||||
|
||||
|
||||
@token_router.delete("/tokens/{token_id}", summary="删除Boss Token")
async def delete_boss_token(
    token_id: int = Path(..., description="Token ID"),
):
    """Delete a Boss token, then empty the token-list cache."""
    await token_controller.delete_token(token_id)
    # Empty the cache so the deletion shows up in list responses right away.
    _TOKENS_CACHE.clear()
    response = Success(msg="删除成功")
    return response
|
||||
|
||||
|
||||
@token_router.post("/tokens/cache/clear", summary="强制清除Token缓存")
async def clear_token_cache():
    """Empty the token-list cache and report how many entries were dropped."""
    # The cache dict is emptied in place and never rebound, so no ``global``
    # declaration is needed here.
    dropped = len(_TOKENS_CACHE)
    _TOKENS_CACHE.clear()
    logger.info(f"手动清除Token缓存,清除了 {dropped} 条缓存数据")
    return Success(msg=f"成功清除 {dropped} 条Token缓存")
|
||||
from typing import Optional, Dict, Any
|
||||
from fastapi import APIRouter, Query, HTTPException
|
||||
from tortoise.transactions import in_transaction
|
||||
from app.models.token import BossToken
|
||||
from app.schemas.base import Success
|
||||
|
||||
# NOTE(review): this rebinds ``token_router`` to a fresh APIRouter, so the
# handlers decorated with the earlier ``token_router`` object above are no
# longer attached to the router this module exports under that name.
# Confirm whether two versions of this module were merged by accident.
token_router = APIRouter()
|
||||
|
||||
|
||||
@token_router.get("/tokens")
async def list_tokens(
    wt2: Optional[str] = Query(None),
    mpt: Optional[str] = Query(None),
    page: int = Query(1, ge=1),
    page_size: int = Query(10, ge=1, le=200),
):
    """Return one page of BossToken rows, served from a short-lived in-memory cache.

    Responses are cached per ``(wt2, mpt, page, page_size)`` combination for
    ``_CACHE_TTL_SECONDS`` seconds, measured with ``time.monotonic``.

    Args:
        wt2 (Optional[str]): Case-insensitive substring filter on ``wt2``.
        mpt (Optional[str]): Case-insensitive substring filter on ``mpt``.
        page (int): 1-based page number.
        page_size (int): Number of rows per page.

    Returns:
        Dict[str, Any]: Response dict with ``code``, ``data`` and ``total``.
    """
    cache_key: Tuple[Any, Any, int, int] = (wt2, mpt, page, page_size)
    now = time.monotonic()
    cached = _TOKENS_CACHE.get(cache_key)
    if cached and (now - cached[0] < _CACHE_TTL_SECONDS):
        return cached[1]

    qs = BossToken.all()
    if wt2:
        qs = qs.filter(wt2__icontains=wt2)
    if mpt:
        qs = qs.filter(mpt__icontains=mpt)
    total = await qs.count()
    items = await qs.order_by("-id").offset((page - 1) * page_size).limit(page_size)
    data = [
        {
            "id": item.id,
            "wt2": item.wt2,
            "mpt": item.mpt,
            "is_active": item.is_active,
            "failed_count": item.failed_count,
            "last_used_time": item.last_used_time,
            "created_at": item.created_at,
        }
        for item in items
    ]
    resp: Dict[str, Any] = {"code": 200, "data": data, "total": total}

    # Keys are built from caller-supplied strings, so without eviction the
    # cache would grow without bound; drop expired entries before storing.
    expired = [
        key
        for key, (stored_at, _) in _TOKENS_CACHE.items()
        if now - stored_at >= _CACHE_TTL_SECONDS
    ]
    for key in expired:
        _TOKENS_CACHE.pop(key, None)

    _TOKENS_CACHE[cache_key] = (now, resp)
    return resp
|
||||
|
||||
|
||||
@token_router.post("/tokens")
async def create_token(payload: Dict[str, Any]):
    """Create a BossToken row from a raw JSON payload and empty the list cache.

    ``is_active`` defaults to ``True`` (coerced with ``bool``) and
    ``failed_count`` to ``0`` (coerced with ``int``). The insert runs inside a
    transaction.

    Returns:
        ``Success`` whose data holds the new row's ``id``.

    Raises:
        HTTPException: status 400 carrying the error text when any step of
            the creation raises.
    """
    try:
        async with in_transaction():
            values = {
                "wt2": payload.get("wt2"),
                "mpt": payload.get("mpt"),
                "is_active": bool(payload.get("is_active", True)),
                "failed_count": int(payload.get("failed_count", 0)),
                "last_used_time": payload.get("last_used_time"),
            }
            created = await BossToken.create(**values)
        _TOKENS_CACHE.clear()
        return Success(data={"id": created.id})
    except Exception as err:
        raise HTTPException(status_code=400, detail=str(err))
|
||||
|
||||
|
||||
@token_router.put("/tokens/{id}")
async def update_token(id: int, payload: Dict[str, Any]):
    """Partially update a BossToken row and empty the list cache.

    Only the keys of ``payload`` that appear in the editable field list are
    copied onto the row.

    Returns:
        ``Success`` whose data holds the row's ``id``.

    Raises:
        HTTPException: status 404 when no row has the given id.
    """
    token = await BossToken.get_or_none(id=id)
    if not token:
        raise HTTPException(status_code=404, detail="Token not found")

    editable = ("wt2", "mpt", "is_active", "failed_count", "last_used_time")
    for attr in editable:
        if attr not in payload:
            continue
        setattr(token, attr, payload[attr])

    await token.save()
    _TOKENS_CACHE.clear()
    return Success(data={"id": token.id})
|
||||
|
||||
|
||||
@token_router.delete("/tokens/{token_id}")
async def delete_token(token_id: int):
    """Delete a BossToken row and empty the list cache.

    Returns:
        ``Success`` whose data echoes the deleted ``id``.

    Raises:
        HTTPException: status 404 when no row has ``token_id``.
    """
    target = await BossToken.get_or_none(id=token_id)
    if target:
        await target.delete()
        _TOKENS_CACHE.clear()
        return Success(data={"id": token_id})
    raise HTTPException(status_code=404, detail="Token not found")
|
||||
8
app/api/v1/users/__init__.py
Normal file
8
app/api/v1/users/__init__.py
Normal file
@ -0,0 +1,8 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .users import router
|
||||
|
||||
# Package-level router: mounts the handlers from ``users.py`` under one tag
# and is the only name this package exports.
users_router = APIRouter()
users_router.include_router(router, tags=["用户模块"])

__all__ = ["users_router"]
|
||||
81
app/api/v1/users/users.py
Normal file
81
app/api/v1/users/users.py
Normal file
@ -0,0 +1,81 @@
|
||||
import logging
|
||||
|
||||
from fastapi import APIRouter, Body, Query
|
||||
from tortoise.expressions import Q
|
||||
|
||||
from app.controllers.dept import dept_controller
|
||||
from app.controllers.user import user_controller
|
||||
from app.schemas.base import Fail, Success, SuccessExtra
|
||||
from app.schemas.users import *
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Router that the user endpoints below register themselves on.
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/list", summary="查看用户列表")
async def list_user(
    page: int = Query(1, description="页码"),
    page_size: int = Query(10, description="每页数量"),
    username: str = Query("", description="用户名称,用于搜索"),
    email: str = Query("", description="邮箱地址"),
    dept_id: int = Query(None, description="部门ID"),
):
    """Return one page of users with their department embedded.

    Args:
        page: Page number passed to ``user_controller.list``.
        page_size: Page size passed to ``user_controller.list``.
        username: Optional substring filter on ``username``.
        email: Optional substring filter on ``email``.
        dept_id: Optional exact-match filter on the user's department id.

    Returns:
        ``SuccessExtra`` with the serialized users (password excluded, the
        ``dept_id`` key replaced by a ``dept`` dict) plus paging metadata.
    """
    q = Q()
    if username:
        q &= Q(username__contains=username)
    if email:
        q &= Q(email__contains=email)
    if dept_id is not None:
        q &= Q(dept_id=dept_id)

    total, user_objs = await user_controller.list(page=page, page_size=page_size, search=q)
    data = [await obj.to_dict(m2m=True, exclude_fields=["password"]) for obj in user_objs]

    # Users on one page often share a department: resolve each department id
    # once per request instead of issuing one lookup per user row.
    dept_by_id = {}
    for item in data:
        # Distinct local name so the ``dept_id`` query parameter is not clobbered.
        user_dept_id = item.pop("dept_id", None)
        if not user_dept_id:
            item["dept"] = {}
            continue
        if user_dept_id not in dept_by_id:
            dept_obj = await dept_controller.get(id=user_dept_id)
            dept_by_id[user_dept_id] = await dept_obj.to_dict()
        item["dept"] = dept_by_id[user_dept_id]

    return SuccessExtra(data=data, total=total, page=page, page_size=page_size)
|
||||
|
||||
|
||||
@router.get("/get", summary="查看用户")
async def get_user(
    user_id: int = Query(..., description="用户ID"),
):
    """Fetch one user and return it as a dict without the password field."""
    user = await user_controller.get(id=user_id)
    return Success(data=await user.to_dict(exclude_fields=["password"]))
|
||||
|
||||
|
||||
@router.post("/create", summary="创建用户")
async def create_user(
    user_in: UserCreate,
):
    """Create a user and assign roles, unless the email is already taken.

    Returns:
        ``Fail`` with code 400 when ``get_by_email`` finds an existing user,
        otherwise ``Success``.
    """
    existing = await user_controller.get_by_email(user_in.email)
    if not existing:
        created = await user_controller.create_user(obj_in=user_in)
        await user_controller.update_roles(created, user_in.role_ids)
        return Success(msg="Created Successfully")
    return Fail(code=400, msg="The user with this email already exists in the system.")
|
||||
|
||||
|
||||
@router.post("/update", summary="更新用户")
async def update_user(
    user_in: UserUpdate,
):
    """Apply ``user_in`` to the user whose id it carries, then reassign roles."""
    updated = await user_controller.update(id=user_in.id, obj_in=user_in)
    role_ids = user_in.role_ids
    await user_controller.update_roles(updated, role_ids)
    return Success(msg="Updated Successfully")
|
||||
|
||||
|
||||
@router.delete("/delete", summary="删除用户")
async def delete_user(
    user_id: int = Query(..., description="用户ID"),
):
    """Remove the user with the given id through ``user_controller``."""
    await user_controller.remove(id=user_id)
    response = Success(msg="Deleted Successfully")
    return response
|
||||
|
||||
|
||||
@router.post("/reset_password", summary="重置密码")
async def reset_password(user_id: int = Body(..., description="用户ID", embed=True)):
    """Reset a user's password through ``user_controller.reset_password``.

    The response message states the value the password was reset to; the
    reset itself happens inside the controller.
    """
    await user_controller.reset_password(user_id)
    response = Success(msg="密码已重置为123456")
    return response
|
||||
2
app/controllers/__init__.py
Normal file
2
app/controllers/__init__.py
Normal file
@ -0,0 +1,2 @@
|
||||
from .role import role_controller as role_controller
|
||||
from .user import user_controller as user_controller
|
||||
45
app/controllers/api.py
Normal file
45
app/controllers/api.py
Normal file
@ -0,0 +1,45 @@
|
||||
from fastapi.routing import APIRoute
|
||||
|
||||
from app.core.crud import CRUDBase
|
||||
from app.log import logger
|
||||
from app.models.admin import Api
|
||||
from app.schemas.apis import ApiCreate, ApiUpdate
|
||||
|
||||
|
||||
class ApiController(CRUDBase[Api, ApiCreate, ApiUpdate]):
    """CRUD controller for ``Api`` rows that can sync them with the live route table."""

    def __init__(self):
        super().__init__(model=Api)

    async def refresh_api(self):
        """Synchronize the ``Api`` table with the application's guarded routes.

        Only ``APIRoute`` entries that declare at least one dependency are
        considered. Rows whose ``(method, path)`` no longer matches such a
        route are deleted; every remaining route is then updated or created.
        """
        # Imported here rather than at module level, as in the original;
        # presumably to avoid a circular import - TODO confirm.
        from app import app

        # Only routes with dependencies (i.e. guarded routes) are tracked.
        guarded_routes = [
            route
            for route in app.routes
            if isinstance(route, APIRoute) and len(route.dependencies) > 0
        ]

        # Set membership keeps the stale-row scan linear in the number of rows.
        live_keys = {(list(route.methods)[0], route.path_format) for route in guarded_routes}

        # Remove rows for APIs that no longer exist.
        stale_keys = [
            (api.method, api.path)
            for api in await Api.all()
            if (api.method, api.path) not in live_keys
        ]
        for method, path in stale_keys:
            logger.debug(f"API Deleted {method} {path}")
            await Api.filter(method=method, path=path).delete()

        # Upsert one row per guarded route.
        for route in guarded_routes:
            method = list(route.methods)[0]
            path = route.path_format
            summary = route.summary
            tags = list(route.tags)[0]
            api_obj = await Api.filter(method=method, path=path).first()
            if api_obj:
                await api_obj.update_from_dict(dict(method=method, path=path, summary=summary, tags=tags)).save()
            else:
                logger.debug(f"API Created {method} {path}")
                await Api.create(**dict(method=method, path=path, summary=summary, tags=tags))


api_controller = ApiController()
|
||||
9
app/controllers/cleaning.py
Normal file
9
app/controllers/cleaning.py
Normal file
@ -0,0 +1,9 @@
|
||||
from app.core.crud import CRUDBase
|
||||
from app.models.cleaning import CleaningTask
|
||||
from app.schemas.cleaning import CleaningTaskCreate, CleaningTaskUpdate
|
||||
|
||||
class CleaningController(CRUDBase[CleaningTask, CleaningTaskCreate, CleaningTaskUpdate]):
    """CRUD controller bound to the ``CleaningTask`` model; adds nothing beyond ``CRUDBase``."""

    def __init__(self):
        super().__init__(model=CleaningTask)


# Shared module-level instance.
cleaning_controller = CleaningController()
|
||||
86
app/controllers/dept.py
Normal file
86
app/controllers/dept.py
Normal file
@ -0,0 +1,86 @@
|
||||
from tortoise.expressions import Q
|
||||
from tortoise.transactions import atomic
|
||||
|
||||
from app.core.crud import CRUDBase
|
||||
from app.models.admin import Dept, DeptClosure
|
||||
from app.schemas.depts import DeptCreate, DeptUpdate
|
||||
|
||||
|
||||
class DeptController(CRUDBase[Dept, DeptCreate, DeptUpdate]):
    """CRUD controller for departments that also maintains ``DeptClosure`` rows."""

    def __init__(self):
        super().__init__(model=Dept)

    async def get_dept_tree(self, name):
        """Return the non-deleted departments as a nested tree.

        Args:
            name: Optional substring filter on the department name; falsy
                values disable the filter.

        Returns:
            A list of top-level nodes (``parent_id == 0``), each a dict with
            ``id``, ``name``, ``desc``, ``order``, ``parent_id`` and a
            recursively built ``children`` list.
        """
        q = Q()
        # Only departments that are not soft-deleted.
        q &= Q(is_deleted=False)
        if name:
            q &= Q(name__contains=name)
        all_depts = await self.model.filter(q).order_by("order")

        # Index departments by parent once so each level is a dict lookup
        # instead of a scan over every department.
        children_by_parent = {}
        for dept in all_depts:
            children_by_parent.setdefault(dept.parent_id, []).append(dept)

        def build_tree(parent_id):
            """Build the subtree below ``parent_id``, keeping query order."""
            return [
                {
                    "id": dept.id,
                    "name": dept.name,
                    "desc": dept.desc,
                    "order": dept.order,
                    "parent_id": dept.parent_id,
                    "children": build_tree(dept.id),
                }
                for dept in children_by_parent.get(parent_id, ())
            ]

        # Top-level departments have parent_id == 0.
        return build_tree(0)

    async def get_dept_info(self):
        """Placeholder; not implemented."""
        pass

    async def update_dept_closure(self, obj: Dept):
        """Insert the closure rows for ``obj`` based on its current ``parent_id``.

        One row is created per closure row whose descendant is the parent
        (level + 1), plus the self-referencing row at level 0.
        """
        parent_depts = await DeptClosure.filter(descendant=obj.parent_id)
        dept_closure_objs: list[DeptClosure] = []
        # Rows linking every ancestor of the parent to this department.
        for item in parent_depts:
            dept_closure_objs.append(DeptClosure(ancestor=item.ancestor, descendant=obj.id, level=item.level + 1))
        # Self-referencing row.
        dept_closure_objs.append(DeptClosure(ancestor=obj.id, descendant=obj.id, level=0))
        await DeptClosure.bulk_create(dept_closure_objs)

    @atomic()
    async def create_dept(self, obj_in: DeptCreate):
        """Create a department and its closure rows.

        A non-zero ``parent_id`` is first looked up with ``self.get``.
        """
        if obj_in.parent_id != 0:
            await self.get(id=obj_in.parent_id)
        new_obj = await self.create(obj_in=obj_in)
        await self.update_dept_closure(new_obj)

    @atomic()
    async def update_dept(self, obj_in: DeptUpdate):
        """Update a department and rebuild its closure rows if the parent changed."""
        dept_obj = await self.get(id=obj_in.id)
        parent_changed = dept_obj.parent_id != obj_in.parent_id

        # Apply the new field values first: ``update_dept_closure`` reads
        # ``dept_obj.parent_id``, so rebuilding before this point would derive
        # the closure rows from the old parent.
        dept_obj.update_from_dict(obj_in.model_dump(exclude_unset=True))
        await dept_obj.save()

        if parent_changed:
            # NOTE(review): deleting by ancestor also removes the rows that
            # link this department to its descendants, and they are not
            # recreated below - confirm whether subtrees need re-linking.
            await DeptClosure.filter(ancestor=dept_obj.id).delete()
            await DeptClosure.filter(descendant=dept_obj.id).delete()
            await self.update_dept_closure(dept_obj)

    @atomic()
    async def delete_dept(self, dept_id: int):
        """Soft-delete a department and remove closure rows where it is the descendant."""
        obj = await self.get(id=dept_id)
        obj.is_deleted = True
        await obj.save()
        await DeptClosure.filter(descendant=dept_id).delete()


dept_controller = DeptController()
|
||||
224
app/controllers/job.py
Normal file
224
app/controllers/job.py
Normal file
@ -0,0 +1,224 @@
|
||||
from typing import Dict, Any, List, Optional
|
||||
from fastapi import HTTPException, BackgroundTasks
|
||||
from app.services.job import DataRouterService, DataType, PlatformType
|
||||
from app.log import logger
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class UniversalDataRequest(BaseModel):
    """Request model for storing a single record."""

    # Payload to store.
    data: Dict[str, Any] = Field(..., description="要存储的数据")
    # Kind of record; the field description lists job/company.
    data_type: DataType = Field(..., description="数据类型 (job/company)")
    # Source platform; the field description lists boss/qcwy/zhilian.
    platform: PlatformType = Field(..., description="平台类型 (boss/qcwy/zhilian)")
    # Whether to check for duplicate data; defaults to True.
    check_duplicate: bool = Field(True, description="是否检查重复数据")
|
||||
|
||||
|
||||
class BatchDataRequest(BaseModel):
    """Request model for storing a batch of records."""

    # Payloads to store, one dict per record.
    data_list: List[Dict[str, Any]] = Field(..., description="要存储的数据列表")
    # Kind of record; the field description lists job/company.
    data_type: DataType = Field(..., description="数据类型 (job/company)")
    # Source platform; the field description lists boss/qcwy/zhilian.
    platform: PlatformType = Field(..., description="平台类型 (boss/qcwy/zhilian)")
    # Whether to check for duplicate data; defaults to True.
    check_duplicate: bool = Field(True, description="是否检查重复数据")
|
||||
|
||||
|
||||
class UniversalDataController:
    """Controller that forwards storage and query requests for all platforms to a ``DataRouterService``."""

    # Display names used in log lines, keyed by the platform enum's value.
    _PLATFORM_LABELS = {"boss": "Boss直聘", "qcwy": "前程无忧", "zhilian": "智联招聘"}

    def __init__(self, data_router_service: DataRouterService):
        self.data_router_service = data_router_service

    async def store_single_data(self, request: UniversalDataRequest) -> Dict[str, Any]:
        """Store one record and return the service result wrapped in a response dict.

        The ``code`` is 200 when the result's ``success`` entry is truthy,
        otherwise 400.

        Raises:
            HTTPException: status 500 when anything in the call raises.
        """
        try:
            outcome = await self.data_router_service.store_data(
                data=request.data,
                data_type=request.data_type,
                platform=request.platform,
                check_duplicate=request.check_duplicate,
            )
            status_code = 200 if outcome["success"] else 400
            return {
                "code": status_code,
                "message": outcome["message"],
                "data": outcome,
                "platform": request.platform,
                "data_type": request.data_type,
            }
        except Exception as e:
            logger.error(f"存储单条数据失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"数据存储失败: {str(e)}")

    async def store_batch_data(self, request: BatchDataRequest) -> Dict[str, Any]:
        """Store a batch of records and return the per-outcome counts.

        Raises:
            HTTPException: status 500 when anything in the call raises.
        """
        try:
            outcome = await self.data_router_service.batch_store_data(
                data_list=request.data_list,
                data_type=request.data_type,
                platform=request.platform,
                check_duplicate=request.check_duplicate,
            )
            summary = f"批量处理完成: 成功 {outcome['success']} 条,失败 {outcome['failed']} 条,重复 {outcome['duplicate']} 条"
            return {
                "code": 200,
                "message": summary,
                "data": outcome,
                "platform": request.platform,
                "data_type": request.data_type,
            }
        except Exception as e:
            logger.error(f"批量存储数据失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"批量数据存储失败: {str(e)}")

    async def store_single_data_async(self,
                                      background_tasks: BackgroundTasks,
                                      request: UniversalDataRequest) -> Dict[str, Any]:
        """Queue the storage of one record as a background task and return a 202 response dict.

        Raises:
            HTTPException: status 500 when queuing raises.
        """
        try:
            # Hand the actual work to the background task runner.
            background_tasks.add_task(self._async_store_single_data, request)
            return {
                "code": 202,
                "message": "数据已加入异步处理队列",
                "platform": request.platform,
                "data_type": request.data_type,
            }
        except Exception as e:
            logger.error(f"异步存储单条数据失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"异步数据存储失败: {str(e)}")

    async def store_batch_data_async(self,
                                     background_tasks: BackgroundTasks,
                                     request: BatchDataRequest) -> Dict[str, Any]:
        """Queue the storage of a batch as a background task and return a 202 response dict.

        Raises:
            HTTPException: status 500 when logging or queuing raises.
        """
        try:
            batch_size = len(request.data_list)
            # Log receipt using the platform's display name when one is known.
            platform_key = request.platform.value
            platform_name = self._PLATFORM_LABELS.get(platform_key, platform_key)
            logger.info(f"📥 收到批量请求: [{platform_name}] {request.data_type.value} x{batch_size} 条")

            # Hand the actual work to the background task runner.
            background_tasks.add_task(self._async_store_batch_data, request)
            return {
                "code": 202,
                "message": f"批量数据已加入异步处理队列,共 {batch_size} 条",
                "platform": request.platform,
                "data_type": request.data_type,
            }
        except Exception as e:
            logger.error(f"异步批量存储数据失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"异步批量数据存储失败: {str(e)}")

    async def _async_store_single_data(self, request: UniversalDataRequest):
        """Background task: store one record and log the outcome; exceptions are logged, not raised."""
        try:
            outcome = await self.data_router_service.store_data(
                data=request.data,
                data_type=request.data_type,
                platform=request.platform,
                check_duplicate=request.check_duplicate,
            )
            if not outcome["success"]:
                logger.warning(f"异步存储 {request.platform} {request.data_type} 数据失败: {outcome['message']}")
            else:
                logger.info(f"异步存储 {request.platform} {request.data_type} 数据成功")
        except Exception as e:
            logger.error(f"异步存储单条数据后台任务失败: {str(e)}")

    async def _async_store_batch_data(self, request: BatchDataRequest):
        """Background task: store a batch and log the counts; exceptions are logged, not raised."""
        try:
            platform_key = request.platform.value
            platform_name = self._PLATFORM_LABELS.get(platform_key, platform_key)

            outcome = await self.data_router_service.batch_store_data(
                data_list=request.data_list,
                data_type=request.data_type,
                platform=request.platform,
                check_duplicate=request.check_duplicate,
            )
            logger.info(f"✅ 批量处理完成: [{platform_name}] 成功 {outcome['success']} 条, 重复 {outcome['duplicate']} 条, 失败 {outcome['failed']} 条")
        except Exception as e:
            logger.error(f"异步批量存储数据后台任务失败: {str(e)}")

    async def query_data(self, platform: PlatformType, data_type: DataType,
                         page: int = 1, page_size: int = 20) -> Dict[str, Any]:
        """Query one page of stored records for a platform and data type.

        Args:
            platform: Platform whose data is queried.
            data_type: Kind of record to query.
            page: 1-based page number, converted to an offset.
            page_size: Number of records per page, passed as the limit.

        Raises:
            HTTPException: status 500 when anything in the call raises.
        """
        try:
            logger.info(f"查询 {platform} {data_type} 数据,页码: {page}, 页大小: {page_size}")

            skip = (page - 1) * page_size
            found = await self.data_router_service.query_json_data(
                platform=platform,
                data_type=data_type,
                limit=page_size,
                offset=skip,
            )
            page_payload = {
                "items": found.get("data", []),
                "total": found.get("count", 0),
                "page": page,
                "page_size": page_size,
            }
            return {
                "code": 200,
                "message": "查询数据成功",
                "data": page_payload,
                "platform": platform,
                "data_type": data_type,
            }
        except Exception as e:
            logger.error(f"查询数据失败: {str(e)}")
            raise HTTPException(status_code=500, detail=f"查询数据失败: {str(e)}")

    async def get_supported_platforms(self) -> Dict[str, Any]:
        """Return the supported platforms, data types, and the listed duplicate keys per platform."""
        duplicate_keys = {
            "boss": {
                "job": "job_id",
                "company": "company_name",
            },
            "qcwy": {
                "job": "job_id + update_date_time",
                "company": "company_name",
            },
            "zhilian": {
                "job": "number + first_publish_time",
                "company": "company_name",
            },
        }
        return {
            "code": 200,
            "message": "获取支持的平台和数据类型成功",
            "data": {
                "platforms": [member.value for member in PlatformType],
                "data_types": [member.value for member in DataType],
                "platform_duplicate_keys": duplicate_keys,
            },
        }
|
||||
|
||||
|
||||
# Factory function that creates controller instances.
def create_universal_data_controller(data_router_service: DataRouterService) -> UniversalDataController:
    """Build a ``UniversalDataController`` bound to ``data_router_service``."""
    controller = UniversalDataController(data_router_service)
    return controller
|
||||
332
app/controllers/keyword.py
Normal file
332
app/controllers/keyword.py
Normal file
@ -0,0 +1,332 @@
|
||||
from datetime import date, datetime
|
||||
import random
|
||||
from typing import Any, Dict, List, Type
|
||||
|
||||
from tortoise.expressions import Q
|
||||
|
||||
from app.core.crud import CRUDBase
|
||||
from app.models.keyword import BossKeyword, QcwyKeyword, ZhilianKeyword
|
||||
|
||||
|
||||
class KeywordController:
|
||||
def __init__(self) -> None:
    """Register the keyword model used for each platform identifier."""
    platform_models: Dict[str, Type] = dict(
        boss=BossKeyword,
        qcwy=QcwyKeyword,
        zhilian=ZhilianKeyword,
    )
    self._model_map: Dict[str, Type] = platform_models
|
||||
|
||||
async def get_available(self, source: str, limit: int = 1, reserve: bool = True) -> Dict[str, Any]:
    """Fetch search conditions (city + job) that have not been used today.

    Args:
        source: Platform identifier, one of boss|qcwy|zhilian.
        limit: Maximum number of items to return (at least one is taken
            whenever any unused row exists).
        reserve: Whether to mark the returned rows as used immediately.

    Returns:
        A dict with ``code``, ``message`` and ``data``; ``data`` holds
        ``items``, ``total`` and ``limit``.

    NOTE(review): the original comments describe the reserve path as atomic,
    but it is a SELECT of candidate ids, then an UPDATE, then a re-read that
    filters only on id and today's date. The re-read can therefore include
    candidate rows that a concurrent call marked - confirm whether stronger
    isolation is needed.
    """
    model = self._ensure_model(source)
    today = date.today()
    now = datetime.now()

    # Count the rows not yet requested today (including never-requested rows).
    search = Q(last_requested_date__not=today) | Q(last_requested_date=None)
    total = await model.filter(search).count()
    items = []

    if total > 0 and reserve:
        # Update first, then read back the rows that now carry today's date.
        take = max(1, min(limit, total))

        try:
            # Pick a window of unused record ids starting at a random offset.
            candidate_records = await model.filter(search).offset(
                random.randint(0, max(0, total - take))
            ).limit(take).only('id')

            candidate_ids = [r.id for r in candidate_records]

            if candidate_ids:
                # Single UPDATE restricted to candidates that are still unused.
                updated_count = await model.filter(
                    id__in=candidate_ids
                ).filter(
                    Q(last_requested_date__isnull=True) | Q(last_requested_date__not=today)
                ).update(
                    last_requested_date=today,
                    last_requested_at=now
                )

                # Read back the candidates that carry today's date.
                if updated_count > 0:
                    records = await model.filter(
                        id__in=candidate_ids,
                        last_requested_date=today
                    ).limit(updated_count)
                    items = [{"id": r.id, "city": r.city, "job": r.job} for r in records]
        except Exception as e:
            # If the update-based path fails, fall back to select-then-mark.
            import logging
            logging.warning(f"原子操作失败,回退到原方法: {e}")
            take = max(1, min(limit, total))
            start = 0 if total == take else random.randint(0, total - take)
            records = await model.filter(search).offset(start).limit(take)
            items = [{"id": r.id, "city": r.city, "job": r.job} for r in records]
            if reserve:
                ids = [r.id for r in records]
                await self.mark_used(source, ids)
    elif total > 0:
        # No reservation requested: just read a random window.
        take = max(1, min(limit, total))
        start = 0 if total == take else random.randint(0, total - take)
        records = await model.filter(search).offset(start).limit(take)
        items = [{"id": r.id, "city": r.city, "job": r.job} for r in records]

    return {
        "code": 200,
        "message": "查询可用检索条件成功",
        "data": {
            "items": items,
            "total": total,
            "limit": limit,
        },
    }
|
||||
|
||||
async def get_stats(self, source: str, on_date: date | None = None) -> Dict[str, Any]:
|
||||
"""统计指定平台在某日期的使用与未使用数量
|
||||
|
||||
参数:
|
||||
source: 平台标识,取值为 boss|qcwy|zhilian
|
||||
on_date: 统计日期,不传则为今天
|
||||
|
||||
返回:
|
||||
包含 total/used/unused 的字典结构
|
||||
"""
|
||||
model = self._ensure_model(source)
|
||||
d = on_date or date.today()
|
||||
total = await model.all().count()
|
||||
used = await model.filter(last_requested_date=d).count()
|
||||
unused = max(0, total - used)
|
||||
return {
|
||||
"code": 200,
|
||||
"message": "统计成功",
|
||||
"data": {
|
||||
"date": str(d),
|
||||
"total": total,
|
||||
"used": used,
|
||||
"unused": unused,
|
||||
},
|
||||
}
|
||||
|
||||
async def mark_used(self, source: str, ids: List[int]) -> Dict[str, Any]:
|
||||
"""将检索条件标记为今日已使用
|
||||
|
||||
参数:
|
||||
source: 平台标识,取值为 boss|qcwy|zhilian
|
||||
ids: 需要标记的记录主键ID列表
|
||||
|
||||
返回:
|
||||
更新结果,包括成功条数与日期
|
||||
"""
|
||||
model = self._ensure_model(source)
|
||||
updated = 0
|
||||
now = datetime.now()
|
||||
today = date.today()
|
||||
for rid in ids:
|
||||
obj = await model.filter(id=rid).first()
|
||||
if obj is None:
|
||||
continue
|
||||
if obj.last_requested_date == today:
|
||||
continue
|
||||
obj.last_requested_date = today
|
||||
obj.last_requested_at = now
|
||||
await obj.save()
|
||||
updated += 1
|
||||
return {
|
||||
"code": 200,
|
||||
"message": "状态更新完成",
|
||||
"data": {
|
||||
"updated": updated,
|
||||
"ids": ids,
|
||||
"date": str(today),
|
||||
},
|
||||
}
|
||||
|
||||
async def list_keywords(
|
||||
self,
|
||||
source: str,
|
||||
page: int = 1,
|
||||
page_size: int = 20,
|
||||
city: str | None = None,
|
||||
job: str | None = None,
|
||||
) -> Dict[str, Any]:
|
||||
"""获取关键词列表
|
||||
|
||||
参数:
|
||||
source: 平台标识
|
||||
page: 页码
|
||||
page_size: 每页数量
|
||||
city: 城市过滤
|
||||
job: 职位过滤
|
||||
|
||||
返回:
|
||||
包含列表数据和分页信息的字典
|
||||
"""
|
||||
model = self._ensure_model(source)
|
||||
queryset = model.all()
|
||||
if city:
|
||||
queryset = queryset.filter(city__icontains=city)
|
||||
if job:
|
||||
queryset = queryset.filter(job__icontains=job)
|
||||
|
||||
total = await queryset.count()
|
||||
queryset = queryset.order_by("-id").offset((page - 1) * page_size).limit(page_size)
|
||||
items = await queryset.values(
|
||||
"id",
|
||||
"city",
|
||||
"job",
|
||||
"last_requested_date",
|
||||
"last_requested_at",
|
||||
"created_at",
|
||||
"updated_at",
|
||||
)
|
||||
|
||||
return {
|
||||
"code": 200,
|
||||
"message": "获取成功",
|
||||
"data": items,
|
||||
"total": total,
|
||||
"page": page,
|
||||
"page_size": page_size,
|
||||
}
|
||||
|
||||
async def create_keyword(self, source: str, obj_in: Any) -> Dict[str, Any]:
|
||||
"""创建关键词
|
||||
|
||||
参数:
|
||||
source: 平台标识
|
||||
obj_in: 创建数据对象
|
||||
|
||||
返回:
|
||||
创建结果
|
||||
"""
|
||||
model = self._ensure_model(source)
|
||||
# Check if already exists
|
||||
exists = await model.filter(city=obj_in.city, job=obj_in.job).exists()
|
||||
if exists:
|
||||
return {"code": 400, "message": "该关键词组合已存在"}
|
||||
|
||||
obj = await model.create(**obj_in.model_dump())
|
||||
data = {
|
||||
"id": obj.id,
|
||||
"city": obj.city,
|
||||
"job": obj.job,
|
||||
"last_requested_date": obj.last_requested_date,
|
||||
"last_requested_at": obj.last_requested_at,
|
||||
"created_at": obj.created_at,
|
||||
"updated_at": obj.updated_at,
|
||||
}
|
||||
return {"code": 200, "message": "创建成功", "data": data}
|
||||
|
||||
async def update_keyword(self, source: str, id: int, obj_in: Any) -> Dict[str, Any]:
|
||||
"""更新关键词
|
||||
|
||||
参数:
|
||||
source: 平台标识
|
||||
id: 记录ID
|
||||
obj_in: 更新数据对象
|
||||
|
||||
返回:
|
||||
更新结果
|
||||
"""
|
||||
model = self._ensure_model(source)
|
||||
obj = await model.filter(id=id).first()
|
||||
if not obj:
|
||||
return {"code": 404, "message": "记录不存在"}
|
||||
|
||||
update_data = obj_in.model_dump(exclude_unset=True)
|
||||
if update_data:
|
||||
# Check for duplicates if updating city or job
|
||||
if "city" in update_data or "job" in update_data:
|
||||
city = update_data.get("city", obj.city)
|
||||
job = update_data.get("job", obj.job)
|
||||
exists = await model.filter(city=city, job=job).exclude(id=id).exists()
|
||||
if exists:
|
||||
return {"code": 400, "message": "该关键词组合已存在"}
|
||||
|
||||
await obj.update_from_dict(update_data)
|
||||
await obj.save()
|
||||
|
||||
data = {
|
||||
"id": obj.id,
|
||||
"city": obj.city,
|
||||
"job": obj.job,
|
||||
"last_requested_date": obj.last_requested_date,
|
||||
"last_requested_at": obj.last_requested_at,
|
||||
"created_at": obj.created_at,
|
||||
"updated_at": obj.updated_at,
|
||||
}
|
||||
return {"code": 200, "message": "更新成功", "data": data}
|
||||
|
||||
async def delete_keyword(self, source: str, id: int) -> Dict[str, Any]:
|
||||
"""删除关键词
|
||||
|
||||
参数:
|
||||
source: 平台标识
|
||||
id: 记录ID
|
||||
|
||||
返回:
|
||||
删除结果
|
||||
"""
|
||||
model = self._ensure_model(source)
|
||||
obj = await model.filter(id=id).first()
|
||||
if not obj:
|
||||
return {"code": 404, "message": "记录不存在"}
|
||||
|
||||
await obj.delete()
|
||||
return {
|
||||
"code": 200,
|
||||
"message": "删除成功",
|
||||
}
|
||||
|
||||
async def get_overview_stats(self) -> Dict[str, Any]:
|
||||
"""获取所有平台的统计概览
|
||||
|
||||
返回:
|
||||
包含各平台统计数据的字典
|
||||
"""
|
||||
today = date.today()
|
||||
stats = {}
|
||||
for source, model in self._model_map.items():
|
||||
total = await model.all().count()
|
||||
used = await model.filter(last_requested_date=today).count()
|
||||
stats[source] = {
|
||||
"total": total,
|
||||
"used": used,
|
||||
"unused": max(0, total - used),
|
||||
}
|
||||
return {
|
||||
"code": 200,
|
||||
"message": "获取概览统计成功",
|
||||
"data": stats,
|
||||
}
|
||||
|
||||
def _ensure_model(self, source: str) -> Type:
|
||||
"""根据平台标识返回对应模型类型
|
||||
|
||||
参数:
|
||||
source: 平台标识,取值为 boss|qcwy|zhilian
|
||||
|
||||
返回:
|
||||
对应的 Tortoise ORM 模型类型
|
||||
"""
|
||||
model = self._model_map.get(source)
|
||||
if not model:
|
||||
raise ValueError("不支持的平台标识")
|
||||
return model
|
||||
16
app/controllers/menu.py
Normal file
16
app/controllers/menu.py
Normal file
@ -0,0 +1,16 @@
|
||||
from typing import Optional
|
||||
|
||||
from app.core.crud import CRUDBase
|
||||
from app.models.admin import Menu
|
||||
from app.schemas.menus import MenuCreate, MenuUpdate
|
||||
|
||||
|
||||
class MenuController(CRUDBase[Menu, MenuCreate, MenuUpdate]):
    """CRUD controller for menu records."""

    def __init__(self):
        super().__init__(model=Menu)

    async def get_by_menu_path(self, path: str) -> Optional["Menu"]:
        """Return the first menu whose ``path`` equals ``path``, or ``None``."""
        matching = self.model.filter(path=path)
        return await matching.first()


# Module-level controller instance.
menu_controller = MenuController()
|
||||
20
app/controllers/proxy.py
Normal file
20
app/controllers/proxy.py
Normal file
@ -0,0 +1,20 @@
|
||||
from app.core.crud import CRUDBase
|
||||
from app.models.cleaning import ProxyConfig, ProxyProvider
|
||||
from app.schemas.proxy import ProxyConfigCreate, ProxyConfigUpdate
|
||||
from app.schemas.proxy_provider import ProxyProviderCreate, ProxyProviderUpdate
|
||||
|
||||
|
||||
class ProxyController(CRUDBase[ProxyConfig, ProxyConfigCreate, ProxyConfigUpdate]):
    """CRUD controller bound to the ``ProxyConfig`` model."""

    def __init__(self):
        super().__init__(model=ProxyConfig)


# Module-level controller instance.
proxy_controller = ProxyController()
|
||||
|
||||
|
||||
class ProxyProviderController(CRUDBase[ProxyProvider, ProxyProviderCreate, ProxyProviderUpdate]):
    """CRUD controller bound to the ``ProxyProvider`` model."""

    def __init__(self):
        super().__init__(model=ProxyProvider)


# Module-level controller instance.
proxy_provider_controller = ProxyProviderController()
|
||||
27
app/controllers/role.py
Normal file
27
app/controllers/role.py
Normal file
@ -0,0 +1,27 @@
|
||||
from typing import List
|
||||
|
||||
from app.core.crud import CRUDBase
|
||||
from app.models.admin import Api, Menu, Role
|
||||
from app.schemas.roles import RoleCreate, RoleUpdate
|
||||
|
||||
|
||||
class RoleController(CRUDBase[Role, RoleCreate, RoleUpdate]):
    """CRUD controller for roles plus assignment of menu and API permissions."""

    def __init__(self):
        super().__init__(model=Role)

    async def is_exist(self, name: str) -> bool:
        """Return whether a role with exactly this name exists."""
        return await self.model.filter(name=name).exists()

    async def update_roles(self, role: Role, menu_ids: List[int], api_infos: List[dict]) -> None:
        """Replace the menus and APIs attached to ``role``.

        Args:
            role: Role whose permissions are rewritten.
            menu_ids: Primary keys of the menus to attach.
            api_infos: Dicts with ``path`` and ``method`` keys identifying the
                APIs to attach.

        Notes:
            Existing relations are cleared first. Ids or path/method pairs
            that match no row are skipped; passing the ``None`` lookup result
            on to ``add()`` would fail after the old relations were already
            removed.
        """
        await role.menus.clear()
        for menu_id in menu_ids:
            menu_obj = await Menu.filter(id=menu_id).first()
            if menu_obj is not None:
                await role.menus.add(menu_obj)

        await role.apis.clear()
        for item in api_infos:
            api_obj = await Api.filter(path=item.get("path"), method=item.get("method")).first()
            if api_obj is not None:
                await role.apis.add(api_obj)


# Module-level controller instance.
role_controller = RoleController()
|
||||
34
app/controllers/token.py
Normal file
34
app/controllers/token.py
Normal file
@ -0,0 +1,34 @@
|
||||
from app.core.crud import CRUDBase
|
||||
from app.models.token import BossToken
|
||||
from tortoise.expressions import Q
|
||||
|
||||
|
||||
class BossPlatform:
    """Token operations for the Boss Zhipin platform, delegated to ``CRUDBase``."""

    def __init__(self):
        self.token_crud = CRUDBase(model=BossToken)

    async def get_tokens(self, page: int = 1, page_size: int = 10, search: Q = None):
        """Return one page of Boss tokens.

        Args:
            page: 1-based page number.
            page_size: Number of rows per page.
            search: Optional filter expression; ``None`` means "no filter".

        Returns:
            Whatever ``CRUDBase.list`` returns for the requested page.
        """
        # Forwarding None would hand a non-Q positional argument to the model
        # filter, so substitute an empty Q for the "no filter" case.
        criteria = search if search is not None else Q()
        return await self.token_crud.list(page=page, page_size=page_size, search=criteria)

    async def get_token(self, token_id: int):
        """Return the Boss token with the given primary key."""
        return await self.token_crud.get(id=token_id)

    async def create_token(self, obj_in: dict):
        """Create a Boss token from ``obj_in``."""
        return await self.token_crud.create(obj_in)

    async def update_token(self, token_id: int, obj_in: dict):
        """Update the Boss token with the given primary key from ``obj_in``."""
        return await self.token_crud.update(id=token_id, obj_in=obj_in)

    async def delete_token(self, token_id: int):
        """Delete the Boss token with the given primary key."""
        return await self.token_crud.remove(id=token_id)


# Module-level instance used by the API routes.
token_controller = BossPlatform()
|
||||
63
app/controllers/user.py
Normal file
63
app/controllers/user.py
Normal file
@ -0,0 +1,63 @@
|
||||
from datetime import datetime
|
||||
from typing import List, Optional
|
||||
|
||||
from fastapi.exceptions import HTTPException
|
||||
|
||||
from app.core.crud import CRUDBase
|
||||
from app.models.admin import User
|
||||
from app.schemas.login import CredentialsSchema
|
||||
from app.schemas.users import UserCreate, UserUpdate
|
||||
from app.utils.password import get_password_hash, verify_password
|
||||
|
||||
from .role import role_controller
|
||||
|
||||
|
||||
class UserController(CRUDBase[User, UserCreate, UserUpdate]):
    """CRUD controller for users plus authentication and role helpers."""

    def __init__(self):
        super().__init__(model=User)

    async def get_by_email(self, email: str) -> Optional[User]:
        """Return the first user with this email, or ``None``."""
        return await self.model.filter(email=email).first()

    async def get_by_username(self, username: str) -> Optional[User]:
        """Return the first user with this username, or ``None``."""
        return await self.model.filter(username=username).first()

    async def create_user(self, obj_in: UserCreate) -> User:
        """Hash the password on ``obj_in`` (in place) and create the user."""
        obj_in.password = get_password_hash(password=obj_in.password)
        return await self.create(obj_in)

    async def update_last_login(self, id: int) -> None:
        """Set the user's ``last_login`` to the current local time."""
        user = await self.model.get(id=id)
        user.last_login = datetime.now()
        await user.save()

    async def authenticate(self, credentials: CredentialsSchema) -> Optional["User"]:
        """Validate a username/password pair and return the matching user.

        Args:
            credentials: Object carrying ``username`` and ``password``.

        Returns:
            The authenticated, active user.

        Raises:
            HTTPException: 400 for an unknown username, a wrong password or a
                disabled user; 500 when the password verifier itself fails.
        """
        user = await self.model.filter(username=credentials.username).first()
        if not user:
            raise HTTPException(status_code=400, detail="无效的用户名")
        try:
            verified = verify_password(credentials.password, user.password)
        except Exception as exc:
            # Keep the original error as the cause so it appears in tracebacks.
            raise HTTPException(
                status_code=500, detail="密码校验失败,请联系管理员安装或修复加密依赖"
            ) from exc
        if not verified:
            raise HTTPException(status_code=400, detail="密码错误!")
        if not user.is_active:
            raise HTTPException(status_code=400, detail="用户已被禁用")
        return user

    async def update_roles(self, user: User, role_ids: List[int]) -> None:
        """Replace the user's roles with the roles identified by ``role_ids``."""
        await user.roles.clear()
        for role_id in role_ids:
            role_obj = await role_controller.get(id=role_id)
            await user.roles.add(role_obj)

    async def reset_password(self, user_id: int):
        """Reset a non-superuser's password to the default value.

        Raises:
            HTTPException: 403 when the target user is a superuser.
        """
        user_obj = await self.get(id=user_id)
        if user_obj.is_superuser:
            raise HTTPException(status_code=403, detail="不允许重置超级管理员密码")
        # NOTE(review): the reset password is a fixed, well-known value;
        # consider generating a random one instead.
        user_obj.password = get_password_hash(password="123456")
        await user_obj.save()


# Module-level controller instance.
user_controller = UserController()
|
||||
203
app/core/algorithms/antispider.py
Normal file
203
app/core/algorithms/antispider.py
Normal file
@ -0,0 +1,203 @@
|
||||
import time
|
||||
import os
|
||||
from typing import Dict, Any, Optional, List, Tuple
|
||||
import random
|
||||
|
||||
class IPStrategyConfig:
    """Tunable thresholds for the IP routing strategy."""

    def __init__(self,
                 response_time_threshold_sec: int = 5,
                 proxy_failure_threshold: int = 3,
                 local_cooldown_sec: int = 1800,
                 local_failure_threshold: int = 2):
        """Store the strategy thresholds.

        Args:
            response_time_threshold_sec (int): Per-request duration threshold
                in seconds.
            proxy_failure_threshold (int): Consecutive failures on one proxy
                that trigger a switch.
            local_cooldown_sec (int): Cooldown in seconds between uses of the
                local IP.
            local_failure_threshold (int): Consecutive local failures after
                which routing goes back to the proxy pool.

        Returns:
            None
        """
        self.response_time_threshold_sec = response_time_threshold_sec
        self.proxy_failure_threshold = proxy_failure_threshold
        self.local_cooldown_sec = local_cooldown_sec
        self.local_failure_threshold = local_failure_threshold

    def update(self, updates: Dict[str, Any]) -> None:
        """Apply new values for existing configuration fields.

        Args:
            updates: Mapping of field name to new value. Keys that are not
                configuration fields of this instance are ignored.
        """
        # Only instance fields are updatable; hasattr() would also match
        # methods such as ``update`` itself and let a caller overwrite them.
        known_fields = vars(self)
        for key, value in updates.items():
            if key in known_fields:
                setattr(self, key, value)
|
||||
|
||||
|
||||
class IPAnomalyDetector:
    """Classify a request outcome as an IP anomaly, or as normal."""

    def __init__(self, cfg: IPStrategyConfig):
        """Keep a reference to the strategy configuration.

        Args:
            cfg (IPStrategyConfig): Strategy configuration; its
                ``response_time_threshold_sec`` is read on every detection.

        Returns:
            None
        """
        self.cfg = cfg

    @staticmethod
    def _mentions_ip_anomaly(text: str) -> bool:
        """Return whether ``text`` carries the IP-anomaly wording."""
        return "IP地址存在异常" in text or ("IP" in text and "异常" in text)

    def detect(self, status_code: Optional[int], elapsed_sec: float, resp_json: Optional[Dict], error_text: str = "") -> Optional[str]:
        """Inspect one request outcome for signs of an IP problem.

        Checks run in order: HTTP status, response time, response body, then
        the error text; the first match wins.

        Args:
            status_code (Optional[int]): HTTP status code; may be None when
                the request raised.
            elapsed_sec (float): Response time in seconds.
            resp_json (Optional[Dict]): Decoded JSON response body.
            error_text (str): Text of the raised error, if any.

        Returns:
            Optional[str]: ``"http_<code>"`` for 403/429/407,
            ``"slow_response"``, ``"ip_banned"``, or None when nothing looks
            wrong.
        """
        if status_code in (403, 429, 407):
            return f"http_{status_code}"

        if elapsed_sec > self.cfg.response_time_threshold_sec:
            return "slow_response"

        if resp_json:
            message = str(resp_json.get("message", ""))
            banned_code = resp_json.get("code") == 35
            if banned_code or self._mentions_ip_anomaly(message):
                return "ip_banned"

        if error_text and self._mentions_ip_anomaly(error_text):
            return "ip_banned"

        return None
|
||||
|
||||
|
||||
class SmartIPManager:
    """Choose between a proxy pool and the local IP based on recent failures."""

    def __init__(self, proxy_pool: Optional[List[Dict[str, str]]], cfg: IPStrategyConfig):
        """Set up routing state.

        Args:
            proxy_pool (Optional[List[Dict[str,str]]]): Proxy pool; each
                element is a requests-compatible proxies dict.
            cfg (IPStrategyConfig): Strategy configuration.

        Returns:
            None
        """
        self.cfg = cfg
        self.proxy_pool: List[Dict[str, str]] = proxy_pool or []
        # Indexes into proxy_pool that reached the failure threshold.
        self.eliminated: set = set()
        self.current_mode: str = 'proxy' if self.proxy_pool else 'local'
        self.current_index: int = 0
        self.proxy_failures_current: int = 0
        self.local_failures: int = 0
        # "Never used". Timestamps come from time.monotonic(), whose origin
        # is arbitrary, so 0.0 could look like a recent use and keep the
        # local route in cooldown.
        self.last_local_use_time: float = float("-inf")
        self.local_disabled_until: float = 0.0

    def current_route(self) -> Tuple[str, Optional[Dict[str, str]]]:
        """Return the current mode and, in proxy mode, the active proxy dict."""
        if self.current_mode == 'proxy' and self.proxy_pool:
            return 'proxy', self.proxy_pool[self.current_index]
        return 'local', None

    def mark_success(self) -> None:
        """Reset the failure counter of the current mode after a success."""
        if self.current_mode == 'proxy':
            self.proxy_failures_current = 0
        else:
            self.local_failures = 0

    def mark_failure(self, reason: str = "") -> None:
        """Count a failure for the current mode.

        In proxy mode the current proxy is eliminated once its consecutive
        failures reach ``cfg.proxy_failure_threshold``. ``reason`` is accepted
        but not used.
        """
        if self.current_mode == 'proxy':
            self.proxy_failures_current += 1
            if self.proxy_failures_current >= self.cfg.proxy_failure_threshold:
                self.eliminated.add(self.current_index)
        else:
            self.local_failures += 1

    def select_next_route(self) -> Tuple[str, Optional[Dict[str, str]]]:
        """Pick the route for the next request and update the routing state.

        Proxy mode: below the failure threshold the current proxy is kept.
        At the threshold the local IP is preferred when available, then the
        next surviving proxy, and finally the local IP regardless of cooldown.

        Local mode: once local failures reach ``cfg.local_failure_threshold``
        the next surviving proxy is chosen if there is one; otherwise the
        local route is kept.

        Returns:
            ``('proxy', proxies_dict)`` or ``('local', None)``.
        """
        now = time.monotonic()

        if self.current_mode == 'proxy':
            if self.proxy_failures_current >= self.cfg.proxy_failure_threshold:
                if self._local_available(now):
                    return self._switch_to_local(now)
                next_idx = self._next_proxy_index()
                if next_idx is not None:
                    self.current_index = next_idx
                    self.proxy_failures_current = 0
                    return 'proxy', self.proxy_pool[self.current_index]
                # No proxy left: fall back to local even during cooldown.
                return self._switch_to_local(now)
            if self.proxy_pool:
                return 'proxy', self.proxy_pool[self.current_index]
            self.current_mode = 'local'
            return 'local', None

        if self.local_failures >= self.cfg.local_failure_threshold:
            next_idx = self._next_proxy_index()
            if next_idx is not None:
                self.current_mode = 'proxy'
                self.current_index = next_idx
                self.local_failures = 0
                return 'proxy', self.proxy_pool[self.current_index]
        return 'local', None

    def _switch_to_local(self, now: float) -> Tuple[str, Optional[Dict[str, str]]]:
        """Enter local mode, stamping the use time and clearing proxy failures."""
        self.current_mode = 'local'
        self.last_local_use_time = now
        self.proxy_failures_current = 0
        return 'local', None

    def _next_proxy_index(self) -> Optional[int]:
        """Return the next non-eliminated proxy index after the current one.

        The search wraps around and checks the current index last; None means
        the pool is empty or every proxy is eliminated.
        """
        if not self.proxy_pool:
            return None
        n = len(self.proxy_pool)
        for step in range(1, n + 1):
            cand = (self.current_index + step) % n
            if cand not in self.eliminated:
                return cand
        return None

    def _local_available(self, now: float) -> bool:
        """Return whether the local IP is neither disabled nor in cooldown."""
        if now < self.local_disabled_until:
            return False
        return (now - self.last_local_use_time) >= self.cfg.local_cooldown_sec

    def disable_local_temporarily(self, seconds: int) -> None:
        """Disable the local IP for ``seconds`` (negative values count as 0)."""
        self.local_disabled_until = time.monotonic() + max(0, seconds)

    def manual_switch_to_proxy(self, index: int) -> None:
        """Switch to the proxy at ``index`` if it exists and is not eliminated."""
        if 0 <= index < len(self.proxy_pool) and index not in self.eliminated:
            self.current_mode = 'proxy'
            self.current_index = index
            self.proxy_failures_current = 0

    def enable_local(self) -> None:
        """Lift any temporary ban on the local IP."""
        self.local_disabled_until = 0.0
|
||||
|
||||
def generate_boss_trace_id() -> str:
    """Build a Boss Zhipin style trace id.

    Layout: ``F-`` + the last 6 hex digits of the current millisecond
    timestamp + 10 random characters drawn from digits, lowercase and
    uppercase ASCII letters.
    """
    millis = int(time.time() * 1000)
    stamp = hex(millis)[2:][-6:]

    alphabet = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    suffix_chars = [random.choice(alphabet) for _ in range(10)]

    return "F-" + stamp + ''.join(suffix_chars)
|
||||
|
||||
def generate_token() -> str:
    """Return 32 random lowercase hexadecimal characters."""
    hex_digits = "0123456789abcdef"
    picked = [random.choice(hex_digits) for _ in range(32)]
    return ''.join(picked)
|
||||
32
app/core/algorithms/signature.py
Normal file
32
app/core/algorithms/signature.py
Normal file
@ -0,0 +1,32 @@
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
from typing import Dict, Any
|
||||
|
||||
class SignatureGenerator:
    """Produce HMAC-SHA256 request signatures with a fixed signing key."""

    def __init__(self, sign_key: str):
        self.sign_key = sign_key

    def hmac_sha256(self, message: str) -> str:
        """Return the hex HMAC-SHA256 of ``message`` (key and text as UTF-8)."""
        mac = hmac.new(
            self.sign_key.encode('utf-8'),
            message.encode('utf-8'),
            hashlib.sha256,
        )
        return mac.hexdigest()

    def generate_signature(self, url_path: str, data: Dict[str, Any] = None) -> str:
        """Sign a request.

        The signed message is ``url_path`` followed, when ``data`` is
        non-empty, by its compact JSON form (no spaces, non-ASCII kept as is).
        Top-level boolean values are sent as the strings "true"/"false".

        Args:
            url_path: Request path to sign.
            data: Optional request body.

        Returns:
            Hex digest of the signature.
        """
        if not data:
            return self.hmac_sha256(url_path)

        # Booleans are replaced by their lowercase string form before dumping.
        normalized = {
            key: ("true" if value else "false") if isinstance(value, bool) else value
            for key, value in data.items()
        }
        body = json.dumps(normalized, ensure_ascii=False, separators=(',', ':'))
        return self.hmac_sha256(url_path + body)
|
||||
31
app/core/bgtask.py
Normal file
31
app/core/bgtask.py
Normal file
@ -0,0 +1,31 @@
|
||||
from starlette.background import BackgroundTasks
|
||||
|
||||
from .ctx import CTX_BG_TASKS
|
||||
|
||||
|
||||
class BgTasks:
    """Single entry point for background tasks kept in ``CTX_BG_TASKS``."""

    @classmethod
    async def init_bg_tasks_obj(cls):
        """Create a fresh ``BackgroundTasks`` and store it in the context."""
        CTX_BG_TASKS.set(BackgroundTasks())

    @classmethod
    async def get_bg_tasks_obj(cls):
        """Return the ``BackgroundTasks`` stored in the context."""
        return CTX_BG_TASKS.get()

    @classmethod
    async def add_task(cls, func, *args, **kwargs):
        """Queue ``func(*args, **kwargs)`` on the context's task collection."""
        queue = await cls.get_bg_tasks_obj()
        queue.add_task(func, *args, **kwargs)

    @classmethod
    async def execute_tasks(cls):
        """Run the queued tasks, normally after the response has been produced."""
        queue = await cls.get_bg_tasks_obj()
        if not queue.tasks:
            return
        await queue()
|
||||
56
app/core/clickhouse.py
Normal file
56
app/core/clickhouse.py
Normal file
@ -0,0 +1,56 @@
|
||||
from clickhouse_connect import get_async_client
|
||||
from clickhouse_connect.driver import AsyncClient as AsyncClickHouseClient
|
||||
from app.settings.config import settings
|
||||
import urllib3
|
||||
from typing import Any, Dict
|
||||
|
||||
|
||||
async def get_clickhouse_client() -> AsyncClickHouseClient:
    """Create an async ClickHouse client that uses a small, bounded HTTP pool.

    Host, port, credentials and database are read from ``settings``.
    """
    # Pool sized for multi-worker deployments: 2 pools x 5 connections keeps
    # each worker at 10 connections at most, and block=True makes callers wait
    # for a free connection instead of opening additional ones.
    pool_options = {
        "num_pools": 2,
        "maxsize": 5,
        "block": True,
    }
    pool_mgr = urllib3.PoolManager(**pool_options)

    client = await get_async_client(
        host=settings.CLICKHOUSE_HOST,
        username=settings.CLICKHOUSE_USER,
        password=settings.CLICKHOUSE_PASS,
        database=settings.CLICKHOUSE_DB,
        port=settings.CLICKHOUSE_PORT,
        pool_mgr=pool_mgr,
        connect_timeout=30,
        send_receive_timeout=120,
    )
    return client
|
||||
|
||||
|
||||
class ClickHouseManager:
    """Hold one lazily created ClickHouse client and expose query helpers."""

    def __init__(self):
        # Created on first use by get_client(); reset to None by close().
        self._client: AsyncClickHouseClient = None

    async def get_client(self) -> AsyncClickHouseClient:
        """Return the cached client, creating it on first use."""
        if self._client is not None:
            return self._client
        self._client = await get_clickhouse_client()
        return self._client

    async def execute(self, query: str, parameters: Dict[str, Any] = None):
        """Run ``query`` with optional ``parameters`` and return the result."""
        active = await self.get_client()
        return await active.query(query, parameters=parameters)

    async def close(self):
        """Close the cached client, if any, and forget it."""
        if not self._client:
            return
        await self._client.close()
        self._client = None


# Module-level ClickHouse manager instance.
clickhouse_manager = ClickHouseManager()
|
||||
240
app/core/clickhouse_init.py
Normal file
240
app/core/clickhouse_init.py
Normal file
@ -0,0 +1,240 @@
|
||||
from clickhouse_connect.driver import AsyncClient
|
||||
from app.log import logger
|
||||
|
||||
|
||||
class ClickHouseInitializer:
|
||||
"""ClickHouse数据库初始化器"""
|
||||
|
||||
def __init__(self, client: AsyncClient):
|
||||
self.client = client
|
||||
|
||||
async def create_boss_job_json_table(self):
|
||||
"""创建BOSS招聘职位JSON存储表"""
|
||||
create_table_sql = """
|
||||
CREATE TABLE IF NOT EXISTS job_data.boss_job (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '', -- 原始JSON数据
|
||||
job_id String DEFAULT '', -- BOSS平台去重字段:jobBaseInfoVO.jobId
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
"""
|
||||
|
||||
try:
|
||||
await self.client.command(create_table_sql)
|
||||
logger.info("BOSS职位JSON数据表 boss_job 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建BOSS职位JSON数据表失败: {e}")
|
||||
raise
|
||||
|
||||
async def create_boss_company_json_table(self):
|
||||
"""创建BOSS招聘公司JSON存储表"""
|
||||
create_table_sql = """
|
||||
CREATE TABLE IF NOT EXISTS job_data.boss_company (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '', -- 原始JSON数据
|
||||
company_name String DEFAULT '', -- 公司名称去重字段
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
"""
|
||||
|
||||
try:
|
||||
await self.client.command(create_table_sql)
|
||||
logger.info("BOSS公司JSON数据表 boss_company 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建BOSS公司JSON数据表失败: {e}")
|
||||
raise
|
||||
|
||||
async def create_qcwy_job_json_table(self):
|
||||
"""创建前程无忧职位JSON存储表"""
|
||||
create_table_sql = """
|
||||
CREATE TABLE IF NOT EXISTS job_data.qcwy_job (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '', -- 原始JSON数据
|
||||
job_id String DEFAULT '', -- QCWY平台去重字段:jobId
|
||||
update_date_time String DEFAULT '', -- QCWY平台去重字段:updateDateTime
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
"""
|
||||
|
||||
try:
|
||||
await self.client.command(create_table_sql)
|
||||
logger.info("前程无忧职位JSON数据表 qcwy_job 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建前程无忧职位JSON数据表失败: {e}")
|
||||
raise
|
||||
|
||||
async def create_qcwy_company_json_table(self):
|
||||
"""创建前程无忧公司JSON存储表"""
|
||||
create_table_sql = """
|
||||
CREATE TABLE IF NOT EXISTS job_data.qcwy_company (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '', -- 原始JSON数据
|
||||
company_name String DEFAULT '', -- 公司名称去重字段
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
"""
|
||||
|
||||
try:
|
||||
await self.client.command(create_table_sql)
|
||||
logger.info("前程无忧公司JSON数据表 qcwy_company 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建前程无忧公司JSON数据表失败: {e}")
|
||||
raise
|
||||
|
||||
async def create_zhilian_job_json_table(self):
|
||||
"""创建智联招聘职位JSON存储表"""
|
||||
create_table_sql = """
|
||||
CREATE TABLE IF NOT EXISTS job_data.zhilian_job (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '', -- 原始JSON数据
|
||||
number String DEFAULT '', -- 智联平台去重字段:number
|
||||
first_publish_time String DEFAULT '', -- 智联平台去重字段:firstPublishTime
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
"""
|
||||
|
||||
try:
|
||||
await self.client.command(create_table_sql)
|
||||
logger.info("智联招聘职位JSON数据表 zhilian_job 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建智联招聘职位JSON数据表失败: {e}")
|
||||
raise
|
||||
|
||||
async def create_zhilian_company_json_table(self):
|
||||
"""创建智联招聘公司JSON存储表"""
|
||||
create_table_sql = """
|
||||
CREATE TABLE IF NOT EXISTS job_data.zhilian_company (
|
||||
id UInt64 DEFAULT 0,
|
||||
json_data String DEFAULT '', -- 原始JSON数据
|
||||
company_name String DEFAULT '', -- 公司名称去重字段
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now()
|
||||
) ENGINE = MergeTree()
|
||||
ORDER BY created_at
|
||||
SETTINGS index_granularity = 8192;
|
||||
"""
|
||||
|
||||
try:
|
||||
await self.client.command(create_table_sql)
|
||||
logger.info("智联招聘公司JSON数据表 zhilian_company 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建智联招聘公司JSON数据表失败: {e}")
|
||||
raise
|
||||
|
||||
async def create_pending_company_table(self):
|
||||
"""创建待处理公司表"""
|
||||
create_table_sql = """
|
||||
CREATE TABLE IF NOT EXISTS job_data.pending_company (
|
||||
source String,
|
||||
company_id String,
|
||||
company_name String DEFAULT '',
|
||||
status String DEFAULT 'pending',
|
||||
error_msg String DEFAULT '',
|
||||
created_at DateTime DEFAULT now(),
|
||||
updated_at DateTime DEFAULT now(),
|
||||
version UInt64 DEFAULT 1
|
||||
) ENGINE = ReplacingMergeTree(version)
|
||||
ORDER BY (source, company_id)
|
||||
SETTINGS index_granularity = 8192;
|
||||
"""
|
||||
|
||||
try:
|
||||
await self.client.command(create_table_sql)
|
||||
logger.info("待处理公司表 pending_company 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建待处理公司表失败: {e}")
|
||||
raise
|
||||
|
||||
async def create_job_analytics_view(self):
|
||||
"""创建统一的招聘数据分析视图"""
|
||||
create_view_sql = """
|
||||
CREATE VIEW IF NOT EXISTS job_data.job_analytics AS
|
||||
SELECT
|
||||
'boss' as source,
|
||||
job_id,
|
||||
JSONExtractString(json_data, 'jobName') as position_name,
|
||||
JSONExtractString(json_data, 'brandName') as company_name,
|
||||
JSONExtractString(json_data, 'salaryDesc') as salary_text,
|
||||
0.0 as salary_min,
|
||||
0.0 as salary_max,
|
||||
JSONExtractString(json_data, 'cityName') as city,
|
||||
JSONExtractString(json_data, 'experienceName') as experience_required,
|
||||
JSONExtractString(json_data, 'degreeName') as education,
|
||||
created_at
|
||||
FROM job_data.boss_job
|
||||
UNION ALL
|
||||
SELECT
|
||||
'qcwy' as source,
|
||||
job_id,
|
||||
JSONExtractString(json_data, 'jobName') as position_name,
|
||||
JSONExtractString(json_data, 'companyName') as company_name,
|
||||
JSONExtractString(json_data, 'provideSalaryString') as salary_text,
|
||||
0.0, 0.0,
|
||||
JSONExtractString(json_data, 'workCity') as city,
|
||||
JSONExtractString(json_data, 'workYear') as experience_required,
|
||||
JSONExtractString(json_data, 'degree') as education,
|
||||
created_at
|
||||
FROM job_data.qcwy_job
|
||||
UNION ALL
|
||||
SELECT
|
||||
'zhilian' as source,
|
||||
number as job_id,
|
||||
JSONExtractString(json_data, 'jobName') as position_name,
|
||||
JSONExtractString(json_data, 'companyName') as company_name,
|
||||
JSONExtractString(json_data, 'salary60') as salary_text,
|
||||
0.0, 0.0,
|
||||
JSONExtractString(json_data, 'workCity') as city,
|
||||
JSONExtractString(json_data, 'workingExp') as experience_required,
|
||||
JSONExtractString(json_data, 'education') as education,
|
||||
created_at
|
||||
FROM job_data.zhilian_job
|
||||
"""
|
||||
try:
|
||||
await self.client.command(create_view_sql)
|
||||
logger.info("招聘数据分析视图 job_analytics 创建成功")
|
||||
except Exception as e:
|
||||
logger.error(f"创建招聘数据分析视图失败: {e}")
|
||||
raise
|
||||
|
||||
async def initialize_all_tables(self):
    """Create every ClickHouse table and then the analytics view.

    Steps run strictly in order: the job/company JSON tables for Boss, QCWY
    and Zhilian, the pending-company table, and finally the unified
    analytics view. The first failing step is logged and its exception is
    re-raised, so later steps do not run.
    """
    logger.info("开始初始化 ClickHouse 数据库表...")

    # Resolved lazily by name so each method is looked up right before it runs.
    step_names = (
        # Boss JSON tables
        "create_boss_job_json_table",
        "create_boss_company_json_table",
        # QCWY (51job) JSON tables
        "create_qcwy_job_json_table",
        "create_qcwy_company_json_table",
        # Zhilian JSON tables
        "create_zhilian_job_json_table",
        "create_zhilian_company_json_table",
        # Pending-company table
        "create_pending_company_table",
        # Unified analytics view
        "create_job_analytics_view",
    )
    try:
        for step_name in step_names:
            await getattr(self, step_name)()
        logger.info("ClickHouse 数据库表初始化完成")
    except Exception as exc:
        logger.error(f"ClickHouse 数据库初始化失败: {exc}")
        raise
|
||||
49
app/core/crud.py
Normal file
49
app/core/crud.py
Normal file
@ -0,0 +1,49 @@
|
||||
from typing import Any, Dict, Generic, List, NewType, Tuple, Type, TypeVar, Union
|
||||
|
||||
from pydantic import BaseModel
|
||||
from tortoise.expressions import Q
|
||||
from tortoise.models import Model
|
||||
|
||||
# Distinct int type used for the total row count returned by CRUDBase.list().
Total = NewType("Total", int)
|
||||
# Any Tortoise ORM model class handled by a CRUD controller.
ModelType = TypeVar("ModelType", bound=Model)
|
||||
# Pydantic schema accepted by CRUDBase.create().
CreateSchemaType = TypeVar("CreateSchemaType", bound=BaseModel)
|
||||
# Pydantic schema accepted by CRUDBase.update().
UpdateSchemaType = TypeVar("UpdateSchemaType", bound=BaseModel)
|
||||
|
||||
|
||||
class CRUDBase(Generic[ModelType, CreateSchemaType, UpdateSchemaType]):
    """Generic create/read/update/delete helper bound to one Tortoise model."""

    def __init__(self, model: Type[ModelType]):
        """Remember the model class every operation works on."""
        self.model = model

    async def get(self, id: int) -> ModelType:
        """Return the row with primary key ``id`` via ``model.get``."""
        return await self.model.get(id=id)

    async def list(
        self,
        page: int,
        page_size: int,
        search: Union[Q, None] = None,
        order: Union[List[str], None] = None,
    ) -> Tuple[Total, List[ModelType]]:
        """Return ``(total, rows)`` for one page of the filtered query.

        Args:
            page: 1-based page number.
            page_size: number of rows per page.
            search: filter expression; ``None`` means no filter.
            order: field names passed to ``order_by``; ``None`` means none.
        """
        # Defaults are created per call: the previous ``Q()`` / ``[]``
        # defaults were single objects shared by every invocation.
        condition = Q() if search is None else search
        ordering = [] if order is None else order
        query = self.model.filter(condition)
        return await query.count(), await query.offset((page - 1) * page_size).limit(page_size).order_by(*ordering)

    async def create(self, obj_in: Union[CreateSchemaType, Dict[str, Any]]) -> ModelType:
        """Insert a row built from a schema instance or a plain dict."""
        if isinstance(obj_in, dict):
            obj_dict = obj_in
        else:
            obj_dict = obj_in.model_dump()
        obj = self.model(**obj_dict)
        await obj.save()
        return obj

    async def update(self, id: int, obj_in: Union[UpdateSchemaType, Dict[str, Any]]) -> ModelType:
        """Apply the given fields to row ``id`` and save it.

        For schema input only explicitly-set fields are applied, and ``id``
        is always excluded.
        """
        if isinstance(obj_in, dict):
            obj_dict = obj_in
        else:
            obj_dict = obj_in.model_dump(exclude_unset=True, exclude={"id"})
        obj = await self.get(id=id)
        obj = obj.update_from_dict(obj_dict)
        await obj.save()
        return obj

    async def remove(self, id: int) -> None:
        """Delete row ``id``; the lookup goes through :meth:`get`."""
        obj = await self.get(id=id)
        await obj.delete()

    async def filter_one(self, **kwargs) -> ModelType:
        """Return the first row matching ``kwargs`` (result of ``.first()``)."""
        return await self.model.filter(**kwargs).first()
|
||||
6
app/core/ctx.py
Normal file
6
app/core/ctx.py
Normal file
@ -0,0 +1,6 @@
|
||||
import contextvars
|
||||
|
||||
from starlette.background import BackgroundTasks
|
||||
|
||||
# Per-context id of the authenticated user; 0 until it is set.
CTX_USER_ID: contextvars.ContextVar[int] = contextvars.ContextVar("user_id", default=0)
|
||||
# Per-context BackgroundTasks holder; defaults to None despite the annotation.
CTX_BG_TASKS: contextvars.ContextVar[BackgroundTasks] = contextvars.ContextVar("bg_task", default=None)
|
||||
64
app/core/dependency.py
Normal file
64
app/core/dependency.py
Normal file
@ -0,0 +1,64 @@
|
||||
from typing import Optional, Dict, Any
|
||||
import jwt
|
||||
from fastapi import Depends, Header, HTTPException, Request
|
||||
|
||||
from app.core.ctx import CTX_USER_ID
|
||||
from app.models import Role, User
|
||||
from app.settings import settings
|
||||
|
||||
|
||||
def get_list_params(skip: int = 0, limit: int = 10, filters: Dict[str, Any] = None, sort_by: str = None, sort_order: str = "desc"):
    """Bundle list-query arguments into a ``ListParams`` object.

    A missing or empty ``filters`` is replaced by a fresh empty dict.
    """
    # NOTE(review): ``ListParams`` is imported lazily from app.core.crud, but
    # the crud module visible in this change does not define it -- confirm it
    # exists, otherwise this call raises ImportError.
    from app.core.crud import ListParams

    effective_filters = filters if filters else {}
    return ListParams(
        skip=skip,
        limit=limit,
        filters=effective_filters,
        sort_by=sort_by,
        sort_order=sort_order,
    )
|
||||
|
||||
|
||||
class AuthControl:
    """Token-based authentication dependency."""

    @classmethod
    async def is_authed(cls, token: str = Header(..., description="token验证")) -> Optional["User"]:
        """Resolve the ``token`` header to a user and record its id in context.

        Returns:
            The authenticated ``User``.

        Raises:
            HTTPException: 401 for an invalid or expired token or an unknown
                user; 500 for any other unexpected failure.
        """
        try:
            if token == "dev":
                # NOTE(security): the literal token "dev" authenticates as the
                # first user in the table with no signature check. Left in
                # place because callers may rely on it; confirm it is
                # unreachable in production.
                user = await User.filter().first()
            else:
                decode_data = jwt.decode(token, settings.SECRET_KEY, algorithms=settings.JWT_ALGORITHM)
                user = await User.filter(id=decode_data.get("user_id")).first()
            # Covers both an unknown user_id and an empty user table on the
            # dev path (previously an AttributeError surfaced as 500).
            if not user:
                raise HTTPException(status_code=401, detail="Authentication failed")
            CTX_USER_ID.set(int(user.id))
            return user
        except HTTPException:
            # Keep our own 401; the generic handler below used to turn it
            # into a 500.
            raise
        except jwt.DecodeError:
            raise HTTPException(status_code=401, detail="无效的Token")
        except jwt.ExpiredSignatureError:
            raise HTTPException(status_code=401, detail="登录已过期")
        except Exception as e:
            raise HTTPException(status_code=500, detail=f"{repr(e)}")
|
||||
|
||||
|
||||
class PermissionControl:
    """Role-based authorization dependency."""

    @classmethod
    async def has_permission(cls, request: Request, current_user: User = Depends(AuthControl.is_authed)) -> None:
        """Allow the request only if a role of the user grants (method, path).

        Superusers always pass. A user without roles, or whose roles do not
        include the exact request method and path, gets a 403.
        """
        if current_user.is_superuser:
            return

        method = request.method
        path = request.url.path

        roles: list[Role] = await current_user.roles
        if not roles:
            raise HTTPException(status_code=403, detail="The user is not bound to a role")

        # Union of (method, path) pairs granted by all roles.
        granted = set()
        for role in roles:
            for api in await role.apis:
                granted.add((api.method, api.path))

        if (method, path) not in granted:
            raise HTTPException(status_code=403, detail=f"Permission denied method:{method} path:{path}")
|
||||
|
||||
|
||||
# Ready-made dependency: requires an authenticated user.
DependAuth = Depends(AuthControl.is_authed)
|
||||
# Ready-made dependency: authentication plus role permission for the route.
DependPermission = Depends(PermissionControl.has_permission)
|
||||
55
app/core/exceptions.py
Normal file
55
app/core/exceptions.py
Normal file
@ -0,0 +1,55 @@
|
||||
from fastapi.exceptions import (
|
||||
HTTPException,
|
||||
RequestValidationError,
|
||||
ResponseValidationError,
|
||||
)
|
||||
from fastapi.requests import Request
|
||||
from fastapi.responses import JSONResponse
|
||||
from tortoise.exceptions import DoesNotExist, IntegrityError
|
||||
from app.log import logger
|
||||
|
||||
|
||||
class SettingNotFound(Exception):
    """Plain ``Exception`` subclass with no behavior of its own."""
|
||||
|
||||
|
||||
async def DoesNotExistHandle(req: Request, exc: DoesNotExist) -> JSONResponse:
    """Turn a Tortoise ``DoesNotExist`` into a 404 JSON response.

    The message includes the exception text and the request query params.
    """
    message = f"Object has not found, exc: {exc}, query_params: {req.query_params}"
    return JSONResponse(content={"code": 404, "msg": message}, status_code=404)
|
||||
|
||||
|
||||
async def IntegrityHandle(_: Request, exc: IntegrityError) -> JSONResponse:
    """Turn a Tortoise ``IntegrityError`` into a 500 JSON response."""
    payload = {"code": 500, "msg": f"IntegrityError,{exc}"}
    return JSONResponse(content=payload, status_code=500)
|
||||
|
||||
|
||||
async def HttpExcHandle(_: Request, exc: HTTPException) -> JSONResponse:
    """Render an ``HTTPException`` as ``{code, msg, data}`` with its own status."""
    status = exc.status_code
    payload = {"code": status, "msg": exc.detail, "data": None}
    return JSONResponse(content=payload, status_code=status)
|
||||
|
||||
|
||||
async def RequestValidationHandle(req: Request, exc: RequestValidationError) -> JSONResponse:
    """Log the offending request and answer with a 422 JSON response.

    The body is logged decoded as UTF-8 (undecodable bytes replaced) and cut
    to 10000 characters. A failure while logging is itself logged and never
    prevents the 422 response.
    """
    try:
        raw = await req.body()
        text = raw.decode("utf-8", errors="replace")
        # Cap what goes into the log.
        if len(text) > 10000:
            text = text[:10000] + "..."
        logger.error(
            f"422 RequestValidationError path={req.url.path} errors={exc.errors()} body={text}"
        )
    except Exception as log_exc:
        logger.error(f"422 RequestValidationError logging failed: {log_exc}")

    payload = {"code": 422, "msg": f"RequestValidationError, {exc}"}
    return JSONResponse(content=payload, status_code=422)
|
||||
|
||||
|
||||
async def ResponseValidationHandle(_: Request, exc: ResponseValidationError) -> JSONResponse:
    """Turn a ``ResponseValidationError`` into a 500 JSON response."""
    payload = {"code": 500, "msg": f"ResponseValidationError, {exc}"}
    return JSONResponse(content=payload, status_code=500)
|
||||
351
app/core/init_app.py
Normal file
351
app/core/init_app.py
Normal file
@ -0,0 +1,351 @@
|
||||
import shutil
|
||||
|
||||
from aerich import Command
|
||||
from fastapi import FastAPI
|
||||
from fastapi.middleware import Middleware
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from tortoise.expressions import Q
|
||||
|
||||
from app.api import api_router
|
||||
from app.controllers.api import api_controller
|
||||
from app.controllers.user import UserCreate, user_controller
|
||||
from app.core.exceptions import (
|
||||
DoesNotExist,
|
||||
DoesNotExistHandle,
|
||||
HTTPException,
|
||||
HttpExcHandle,
|
||||
IntegrityError,
|
||||
IntegrityHandle,
|
||||
RequestValidationError,
|
||||
RequestValidationHandle,
|
||||
ResponseValidationError,
|
||||
ResponseValidationHandle,
|
||||
)
|
||||
from app.log import logger
|
||||
from app.models.admin import Api, Menu, Role
|
||||
from app.schemas.menus import MenuType
|
||||
from app.settings.config import settings
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from app.core.clickhouse_init import ClickHouseInitializer
|
||||
|
||||
from .middlewares import BackGroundTaskMiddleware, HttpAuditLogMiddleware
|
||||
from .ip_tracking import IpTrackingMiddleware
|
||||
|
||||
|
||||
def make_middlewares():
    """Build the middleware list in this order: CORS, background tasks, audit log, IP tracking."""
    cors = Middleware(
        CORSMiddleware,
        allow_origins=settings.CORS_ORIGINS,
        allow_credentials=settings.CORS_ALLOW_CREDENTIALS,
        allow_methods=settings.CORS_ALLOW_METHODS,
        allow_headers=settings.CORS_ALLOW_HEADERS,
    )
    background = Middleware(BackGroundTaskMiddleware)
    # Audit these methods, except the token endpoint and the API docs.
    audit = Middleware(
        HttpAuditLogMiddleware,
        methods=["GET", "POST", "PUT", "DELETE"],
        exclude_paths=[
            "/api/v1/base/access_token",
            "/docs",
            "/openapi.json",
        ],
    )
    ip_tracking = Middleware(IpTrackingMiddleware)
    return [cors, background, audit, ip_tracking]
|
||||
|
||||
|
||||
def register_exceptions(app: FastAPI):
    """Attach each project exception handler to ``app``."""
    handlers = (
        (DoesNotExist, DoesNotExistHandle),
        (HTTPException, HttpExcHandle),
        (IntegrityError, IntegrityHandle),
        (RequestValidationError, RequestValidationHandle),
        (ResponseValidationError, ResponseValidationHandle),
    )
    for exc_type, handler in handlers:
        app.add_exception_handler(exc_type, handler)
|
||||
|
||||
|
||||
def register_routers(app: FastAPI, prefix: str = "/api"):
    """Mount the project's ``api_router`` on ``app`` under ``prefix``."""
    app.include_router(router=api_router, prefix=prefix)
|
||||
|
||||
|
||||
async def init_superuser():
    """Create the default ``admin`` superuser when no user exists yet."""
    if await user_controller.model.exists():
        return

    # NOTE(review): the bootstrap account uses a hard-coded, well-known
    # password -- confirm it is changed after first login.
    default_admin = UserCreate(
        username="admin",
        email="admin@admin.com",
        password="123456",
        is_active=True,
        is_superuser=True,
    )
    await user_controller.create_user(default_admin)
|
||||
|
||||
|
||||
async def init_menus():
    """Seed the default menu tree when the menu table is empty.

    Three top-level catalogs are created, each followed by a bulk insert of
    its child menus: system management, recruitment data management, and
    data cleaning. Children are ordered 1..n as listed.
    """
    if await Menu.exists():
        return

    # (catalog fields, keepalive flag for the children,
    #  children as (name, path, icon, component))
    catalogs = (
        (
            dict(name="系统管理", path="/system", order=1, icon="carbon:gui-management", redirect="/system/user"),
            False,
            (
                ("用户管理", "user", "material-symbols:person-outline-rounded", "/system/user"),
                ("角色管理", "role", "carbon:user-role", "/system/role"),
                ("菜单管理", "menu", "material-symbols:list-alt-outline", "/system/menu"),
                ("API管理", "api", "ant-design:api-outlined", "/system/api"),
                ("部门管理", "dept", "mingcute:department-line", "/system/dept"),
                ("审计日志", "auditlog", "ph:clipboard-text-bold", "/system/auditlog"),
            ),
        ),
        # Recruitment data management
        (
            dict(name="招聘数据管理", path="/recruitment", order=2, icon="mdi:briefcase-search", redirect="/recruitment/qcwy"),
            True,
            (
                ("前程无忧", "qcwy", "mdi:alpha-q-box", "/recruitment/qcwy"),
                ("智联招聘", "zhilian", "mdi:alpha-z-box", "/recruitment/zhilian"),
                ("Boss直聘", "boss", "mdi:alpha-b-box", "/recruitment/boss"),
            ),
        ),
        # Data cleaning
        (
            dict(name="数据清理", path="/cleaning", order=3, icon="mdi:database-refresh", redirect="/cleaning/targeted"),
            True,
            (
                ("定向数据", "targeted", "mdi:filter-target", "/cleaning/index"),
                ("清洗监控", "monitor", "mdi:monitor-dashboard", "/cleaning/monitor"),
            ),
        ),
    )

    for catalog_fields, child_keepalive, children in catalogs:
        # The catalog must exist first: children reference its generated id.
        catalog = await Menu.create(
            menu_type=MenuType.CATALOG,
            parent_id=0,
            is_hidden=False,
            component="Layout",
            keepalive=False,
            **catalog_fields,
        )
        await Menu.bulk_create(
            [
                Menu(
                    menu_type=MenuType.MENU,
                    name=child_name,
                    path=child_path,
                    order=position,
                    parent_id=catalog.id,
                    icon=child_icon,
                    is_hidden=False,
                    component=child_component,
                    keepalive=child_keepalive,
                )
                for position, (child_name, child_path, child_icon, child_component) in enumerate(children, start=1)
            ]
        )
|
||||
|
||||
|
||||
async def init_apis():
    """Populate the API table via ``refresh_api`` when it is empty."""
    if await api_controller.model.exists():
        return
    await api_controller.refresh_api()
|
||||
|
||||
|
||||
async def init_db():
    """Run the aerich migration sequence: init, migrate, upgrade.

    Whether and when this runs is decided by the caller (``init_data``).
    If ``migrate`` raises ``AttributeError`` the migration history is
    rebuilt from scratch before upgrading.
    """
    command = Command(tortoise_config=settings.TORTOISE_ORM)
    try:
        await command.init_db(safe=True)
    except FileExistsError:
        # NOTE(review): aerich is documented to raise this when the
        # migrations directory already exists (e.g. on every start after the
        # first) -- verify against the installed aerich version.
        pass
    await command.init()
    try:
        await command.migrate()
    except AttributeError:
        logger.warning("unable to retrieve model history from database, model history will be created from scratch")
        # The directory may already be gone; its absence is not an error.
        shutil.rmtree("migrations", ignore_errors=True)
        await command.init_db(safe=True)
    await command.upgrade(run_in_transaction=True)
|
||||
|
||||
|
||||
async def init_roles():
    """Seed the admin and regular-user roles when no role exists.

    The admin role gets every API and every menu. The regular role gets
    every menu plus the APIs that are GET or tagged "基础模块".
    """
    if await Role.exists():
        return

    admin_role = await Role.create(name="管理员", desc="管理员角色")
    user_role = await Role.create(name="普通用户", desc="普通用户角色")

    # Admin: all APIs.
    await admin_role.apis.add(*(await Api.all()))

    # Both roles: all menus.
    every_menu = await Menu.all()
    await admin_role.menus.add(*every_menu)
    await user_role.menus.add(*every_menu)

    # Regular user: read-only and basic-module APIs.
    basic_filter = Q(method__in=["GET"]) | Q(tags="基础模块")
    await user_role.apis.add(*(await Api.filter(basic_filter)))
|
||||
|
||||
|
||||
async def init_clickhouse():
    """Initialize ClickHouse tables; skipped when no host is configured.

    Failures are logged and swallowed so they never stop application startup.
    """
    if not (settings.CLICKHOUSE_HOST or ""):
        return
    try:
        ch_client = await clickhouse_manager.get_client()
        await ClickHouseInitializer(ch_client).initialize_all_tables()
        logger.info("ClickHouse初始化完成")
    except Exception as exc:
        logger.error(f"ClickHouse初始化失败: {exc}")
|
||||
|
||||
|
||||
async def init_data():
    """Run startup initialization, guarded by settings flags and a lock dir.

    Migrations and seed data run only in the process that manages to create
    the ``.startup_lock`` directory, and only if the matching settings flag
    is on. ClickHouse initialization is attempted in every process.
    """
    import os

    should_migrate = settings.RUN_MIGRATIONS_ON_STARTUP
    should_seed = settings.INITIALIZE_SEED_DATA_ON_STARTUP

    lock_dir = ".startup_lock"
    # mkdir is atomic, so it works as a simple cross-worker lock.
    try:
        os.mkdir(lock_dir)
        acquired = True
    except OSError:
        acquired = False

    try:
        if should_migrate and acquired:
            await init_db()

        if should_seed and acquired:
            await init_superuser()
            await init_menus()
            await init_apis()
            await init_roles()

        # ClickHouse init is optional and must not affect the main app.
        await init_clickhouse()
    finally:
        # Release even when a step failed; otherwise the stale directory
        # would make every later startup skip migrations and seeding.
        if acquired:
            try:
                os.rmdir(lock_dir)
            except OSError:
                pass
|
||||
82
app/core/ip_tracking.py
Normal file
82
app/core/ip_tracking.py
Normal file
@ -0,0 +1,82 @@
|
||||
from datetime import datetime
|
||||
from typing import Any
|
||||
|
||||
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
|
||||
from starlette.requests import Request
|
||||
from starlette.responses import Response
|
||||
|
||||
from app.models.metrics import IpUploadStats
|
||||
|
||||
|
||||
class IpTrackingMiddleware(BaseHTTPMiddleware):
    """Count uploads per (source, client IP, day) for the data endpoints."""

    async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
        """Run the request, then record upload statistics on a best-effort basis.

        Errors from the downstream app propagate unchanged. Errors while
        recording statistics are swallowed so they never affect the response.
        """
        response = await call_next(request)

        try:
            url_path = request.url.path
            tracked_prefixes = (
                "/api/v1/universal/data",
                "/api/v1/boss/",
                "/api/v1/qcwy/",
                "/api/v1/zhilian/",
            )
            if url_path.startswith(tracked_prefixes):
                args = getattr(request.state, "request_args", {})
                # Prefer the explicit platform argument, else derive from the URL.
                source = args.get("platform")
                if not source:
                    source = ""
                    for platform in ("boss", "qcwy", "zhilian"):
                        if url_path.startswith(f"/api/v1/{platform}/"):
                            source = platform
                            break
                ip = self._extract_ip(request)
                count = self._estimate_count(args, response)
                if source and ip and count:
                    await self._update_stats(source, ip, count)
        except Exception:
            pass
        return response

    def _extract_ip(self, request: Request) -> str:
        """Return the client IP: first X-Forwarded-For hop, then X-Real-IP, then the socket peer."""
        headers = request.headers
        forwarded = headers.get("x-forwarded-for") or headers.get("X-Forwarded-For")
        if forwarded:
            return forwarded.split(",")[0].strip()
        real_ip = headers.get("x-real-ip") or headers.get("X-Real-IP")
        if real_ip:
            return real_ip.strip()
        if request.client:
            return request.client.host
        return ""

    def _estimate_count(self, args: dict, response: Response) -> int:
        """Estimate how many records the request uploaded; 0 on any error.

        Uses ``data.success`` from a JSON response body when available,
        otherwise the length of ``data_list`` or 1 for a single ``data`` item.
        """
        try:
            # Synchronous endpoints: read the success count from the response body.
            raw_body = getattr(response, "body", None)
            if raw_body:
                import json

                payload = json.loads(raw_body)
                inner = payload.get("data") if isinstance(payload, dict) else None
                if isinstance(inner, dict) and "success" in inner:
                    return int(inner["success"])
            # Async endpoints or no detailed response: estimate from the request.
            items = args.get("data_list") if "data_list" in args else None
            if isinstance(items, list):
                return len(items)
            if "data" in args:
                return 1
        except Exception:
            pass
        return 0

    async def _update_stats(self, source: str, ip: str, inc: int) -> None:
        """Add ``inc`` to today's counter for (source, ip), creating the row if needed."""
        from datetime import timezone

        # Timezone-aware timestamp so it matches the datetime type stored in the DB.
        now = datetime.now(timezone.utc)
        today = now.date()
        row = await IpUploadStats.get_or_none(source=source, ip=ip, date=today)
        if not row:
            await IpUploadStats.create(
                source=source,
                ip=ip,
                date=today,
                upload_count=inc,
                last_report_at=now,
                status="normal",
            )
            return
        row.upload_count = row.upload_count + inc
        row.last_report_at = now
        # A fresh report resets any non-normal status.
        if getattr(row, "status", "normal") != "normal":
            row.status = "normal"
        await row.save()
|
||||
75
app/core/locks.py
Normal file
75
app/core/locks.py
Normal file
@ -0,0 +1,75 @@
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
|
||||
class DistributedLock:
    """Named lock that prefers Redis and falls back to a lock directory.

    The Redis lock expires after ``ttl_seconds``. The directory fallback
    (``.lock_<name>`` in the current working directory) has no expiry and
    only protects processes sharing that directory.
    """

    def __init__(self, name: str, ttl_seconds: int = 600):
        """Prepare the lock and probe Redis once; any failure selects the file fallback."""
        self.name = name
        self.ttl = ttl_seconds
        self.token = str(uuid.uuid4())
        self._use_redis = False
        self._redis = None
        self._file_path = f".lock_{self.name}"
        # Backend this instance currently holds the lock on: "redis", "file" or None.
        self._held_backend = None
        try:
            import redis  # type: ignore
            from app.settings.config import settings

            self._redis = redis.Redis(
                host=getattr(settings, "REDIS_HOST", None) or "",
                port=getattr(settings, "REDIS_PORT", 6379),
                db=getattr(settings, "REDIS_DB", 0),
                password=getattr(settings, "REDIS_PASS", None) or None,
                socket_timeout=3,
            )
            # Only use Redis if it actually answers.
            if self._redis.ping():
                self._use_redis = True
        except Exception:
            self._use_redis = False

    async def acquire(self) -> bool:
        """Try to take the lock without waiting; return whether it was obtained."""
        if self._use_redis and self._redis is not None:
            try:
                # NX + EX: set only if absent, with expiry, in one command.
                got_it = bool(self._redis.set(f"lock:{self.name}", self.token, nx=True, ex=self.ttl))
            except Exception:
                # Redis failed mid-flight: fall through to the file lock.
                pass
            else:
                if got_it:
                    self._held_backend = "redis"
                return got_it
        # File-lock fallback (safe on a single machine): mkdir is atomic.
        try:
            os.mkdir(self._file_path)
        except Exception:
            return False
        self._held_backend = "file"
        return True

    async def release(self) -> None:
        """Release the lock if, and only if, this instance holds it.

        Previously the lock directory was removed unconditionally, so a
        caller that failed to acquire could delete another holder's lock.
        """
        held, self._held_backend = self._held_backend, None
        if held == "redis" and self._redis is not None:
            try:
                # Simple check-then-delete; a Lua script would make it atomic.
                key = f"lock:{self.name}"
                val = self._redis.get(key)
                if val and val.decode() == self.token:
                    self._redis.delete(key)
            except Exception:
                pass
        elif held == "file":
            try:
                os.rmdir(self._file_path)
            except Exception:
                pass

    @asynccontextmanager
    async def context(self):
        """Async context manager yielding True/False for whether the lock was taken.

        The body always runs; callers must check the yielded flag. The lock
        is released on exit only when it was acquired.
        """
        acquired = await self.acquire()
        try:
            yield acquired
        finally:
            if acquired:
                await self.release()
|
||||
182
app/core/middlewares.py
Normal file
182
app/core/middlewares.py
Normal file
@ -0,0 +1,182 @@
|
||||
import json
|
||||
import re
|
||||
from datetime import datetime
|
||||
from typing import Any, AsyncGenerator
|
||||
|
||||
from fastapi import FastAPI
|
||||
from fastapi.responses import Response
|
||||
from fastapi.routing import APIRoute
|
||||
from starlette.middleware.base import BaseHTTPMiddleware, RequestResponseEndpoint
|
||||
from starlette.requests import Request
|
||||
from starlette.types import ASGIApp, Receive, Scope, Send
|
||||
|
||||
from app.core.dependency import AuthControl
|
||||
from app.models.admin import AuditLog, User
|
||||
|
||||
from .bgtask import BgTasks
|
||||
|
||||
|
||||
class SimpleBaseMiddleware:
    """Minimal ASGI middleware with before/after hooks for HTTP requests."""

    def __init__(self, app: ASGIApp) -> None:
        """Store the wrapped ASGI application."""
        self.app = app

    async def __call__(self, scope: Scope, receive: Receive, send: Send) -> None:
        """Pass non-HTTP traffic straight through; wrap HTTP requests with the hooks."""
        if scope["type"] == "http":
            request = Request(scope, receive=receive)

            # A hook may return a replacement ASGI callable; a falsy result
            # means "continue with the wrapped app".
            handler = await self.before_request(request)
            if not handler:
                handler = self.app
            await handler(request.scope, request.receive, send)
            await self.after_request(request)
        else:
            await self.app(scope, receive, send)

    async def before_request(self, request: Request):
        """Hook run before the app; the default returns the wrapped app."""
        return self.app

    async def after_request(self, request: Request):
        """Hook run after the response was sent; the default does nothing."""
        return None
|
||||
|
||||
|
||||
class BackGroundTaskMiddleware(SimpleBaseMiddleware):
    """Initialize the background-task holder per request and run the tasks afterwards."""

    async def before_request(self, request):
        """Set up the task container; returns None so the wrapped app handles the request."""
        await BgTasks.init_bg_tasks_obj()
        return None

    async def after_request(self, request):
        """Run the queued background tasks once the response has been sent."""
        await BgTasks.execute_tasks()
        return None
|
||||
|
||||
|
||||
class HttpAuditLogMiddleware(BaseHTTPMiddleware):
    """Write one ``AuditLog`` row per request for the configured HTTP methods."""

    def __init__(self, app, methods: list[str], exclude_paths: list[str]):
        """
        Args:
            app: wrapped ASGI application.
            methods: HTTP methods that are audited.
            exclude_paths: patterns matched with ``re.search`` (case-insensitive)
                against the URL path; matching requests are not audited.
        """
        super().__init__(app)
        self.methods = methods
        self.exclude_paths = exclude_paths
        self.audit_log_paths = ["/api/v1/auditlog/list"]
        self.max_body_size = 1024 * 1024  # 1 MB cap on logged response bodies

    async def get_request_args(self, request: Request) -> dict:
        """Collect query parameters plus the JSON or form body of the request.

        A JSON object body is merged into the result. Any other JSON value is
        stored under the key ``"body"``. For form data, uploaded files are
        recorded by filename only. Bodies that cannot be parsed are ignored.
        """
        args = dict(request.query_params.items())

        if request.method in ["POST", "PUT", "PATCH"]:
            try:
                body = await request.json()
            except ValueError:
                # ValueError covers JSONDecodeError and also UnicodeDecodeError,
                # which json raises for binary (e.g. multipart file) bodies and
                # which used to escape and fail the whole request.
                try:
                    form = await request.form()
                    for k, v in form.items():
                        if hasattr(v, "filename"):  # file upload
                            args[k] = v.filename
                        elif isinstance(v, list) and v and hasattr(v[0], "filename"):
                            args[k] = [file.filename for file in v]
                        else:
                            args[k] = v
                except Exception:
                    pass
            else:
                if isinstance(body, dict):
                    args.update(body)
                elif body is not None:
                    # A JSON array/scalar cannot be merged into a dict;
                    # ``args.update`` used to raise on it.
                    args["body"] = body

        return args

    async def get_response_body(self, request: Request, response: Response) -> Any:
        """Return the response body, parsed as JSON when possible.

        Responses whose Content-Length exceeds ``max_body_size`` are replaced
        by a placeholder. Streaming bodies are drained and re-attached so the
        client still receives them.
        """
        content_length = response.headers.get("content-length")
        if content_length and int(content_length) > self.max_body_size:
            return {"code": 0, "msg": "Response too large to log", "data": None}

        if hasattr(response, "body"):
            body = response.body
        else:
            body_chunks = []
            async for chunk in response.body_iterator:
                if not isinstance(chunk, bytes):
                    chunk = chunk.encode(response.charset)
                body_chunks.append(chunk)

            # The iterator is consumed now; give the response a replayable one.
            response.body_iterator = self._async_iter(body_chunks)
            body = b"".join(body_chunks)

        if any(request.url.path.startswith(path) for path in self.audit_log_paths):
            try:
                data = self.lenient_json(body)
                # Audit-log listings: drop nested response bodies so log
                # entries do not recursively embed earlier entries.
                if isinstance(data, dict):
                    data.pop("response_body", None)
                    if "data" in data and isinstance(data["data"], list):
                        for item in data["data"]:
                            item.pop("response_body", None)
                return data
            except Exception:
                return None

        return self.lenient_json(body)

    def lenient_json(self, v: Any) -> Any:
        """Parse ``v`` as JSON if it is str/bytes and valid; otherwise return it unchanged."""
        if isinstance(v, (str, bytes)):
            try:
                return json.loads(v)
            except (ValueError, TypeError):
                pass
        return v

    async def _async_iter(self, items: list[bytes]) -> AsyncGenerator[bytes, None]:
        """Replay already-collected chunks as an async iterator."""
        for item in items:
            yield item

    async def get_request_log(self, request: Request, response: Response) -> dict:
        """Build the audit record fields from the request and response.

        Includes path, status, method, the matching route's tags and summary,
        and the user resolved from the ``token`` header (id 0 and an empty
        name when absent or invalid).
        """
        data: dict = {"path": request.url.path, "status": response.status_code, "method": request.method, "summary": "", "module": ""}
        # Route metadata; if several routes match, the last one wins.
        app: FastAPI = request.app
        for route in app.routes:
            if (
                isinstance(route, APIRoute)
                and route.path_regex.match(request.url.path)
                and request.method in route.methods
            ):
                data["module"] = ",".join(route.tags) if route.tags else ""
                data["summary"] = route.summary or ""
        # User information
        try:
            token = request.headers.get("token")
            user_obj = None
            if token:
                user_obj: User = await AuthControl.is_authed(token)
            data["user_id"] = user_obj.id if user_obj else 0
            data["username"] = user_obj.username if user_obj else ""
        except Exception:
            data["user_id"] = 0
            data["username"] = ""
        return data

    async def before_request(self, request: Request):
        """Parse the request arguments and stash them on ``request.state``."""
        request_args = await self.get_request_args(request)
        request.state.request_args = request_args

    async def after_request(self, request: Request, response: Response, process_time: int):
        """Persist an audit row unless the method or path is excluded.

        Args:
            process_time: request duration in milliseconds.
        """
        if request.method in self.methods:
            for path in self.exclude_paths:
                if re.search(path, request.url.path, re.I) is not None:
                    return
            data: dict = await self.get_request_log(request=request, response=response)
            data["response_time"] = process_time

            data["request_args"] = request.state.request_args
            data["response_body"] = await self.get_response_body(request, response)
            await AuditLog.create(**data)

        return response

    async def dispatch(self, request: Request, call_next: RequestResponseEndpoint) -> Response:
        """Time the request, run the before/after hooks, and return the response."""
        start_time: datetime = datetime.now()
        await self.before_request(request)
        response = await call_next(request)
        end_time: datetime = datetime.now()
        process_time = int((end_time.timestamp() - start_time.timestamp()) * 1000)
        await self.after_request(request, response, process_time)
        return response
|
||||
65
app/core/proxy_rule.py
Normal file
65
app/core/proxy_rule.py
Normal file
@ -0,0 +1,65 @@
|
||||
import json
|
||||
import re
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from app.models.cleaning import ProxyProvider
|
||||
|
||||
|
||||
def _resolve_path(data: Any, path: str) -> Any:
|
||||
if not path:
|
||||
return data
|
||||
current = data
|
||||
for part in path.split("."):
|
||||
if isinstance(current, list):
|
||||
try:
|
||||
index = int(part)
|
||||
except ValueError:
|
||||
return None
|
||||
if index < 0 or index >= len(current):
|
||||
return None
|
||||
current = current[index]
|
||||
elif isinstance(current, dict):
|
||||
current = current.get(part)
|
||||
else:
|
||||
return None
|
||||
return current
|
||||
|
||||
|
||||
def parse_proxies(raw_body: str, provider: "ProxyProvider") -> List[str]:
    """Extract proxy strings from a provider response.

    Args:
        raw_body: response text returned by the proxy provider.
        provider: parsing rule. ``mode`` is ``"json"`` or ``"text"``;
            ``template`` is a ``str.format`` template (default
            ``"{ip}:{port}"``). JSON mode reads ``list_path`` and the
            ``*_path`` fields; text mode reads ``pattern``, a regex whose
            named groups feed the template.

    Returns:
        One formatted string per entry. Entries where a field used by the
        template is ``None`` are skipped rather than rendered as the literal
        text ``"None"``. An unknown mode or a text rule without a pattern
        yields an empty list.

    Raises:
        json.JSONDecodeError: JSON mode with a body that is not valid JSON.
        KeyError: the template references a field the rule never provides.
    """
    import string

    mode = provider.mode
    template = provider.template or "{ip}:{port}"
    # Root names of the replacement fields, e.g. "{ip}:{port}" -> {"ip", "port"}.
    used_fields = {
        field.split(".")[0].split("[")[0]
        for _, field, _, _ in string.Formatter().parse(template)
        if field
    }

    def render(context: Dict[str, Any]) -> None:
        # Only a present-but-None value is skipped; an absent key still
        # raises KeyError so a misconfigured template stays visible.
        if any(name in context and context[name] is None for name in used_fields):
            return
        result.append(template.format(**context))

    result: List[str] = []
    if mode == "json":
        obj = json.loads(raw_body)
        items = _resolve_path(obj, provider.list_path) if provider.list_path else obj
        if items is None:
            return result
        if not isinstance(items, list):
            # A single object is treated as a one-element list.
            items = [items]
        for item in items:
            context: Dict[str, Any] = {}
            if provider.ip_path:
                context["ip"] = _resolve_path(item, provider.ip_path)
            if provider.port_path:
                context["port"] = _resolve_path(item, provider.port_path)
            if provider.username_path:
                context["username"] = _resolve_path(item, provider.username_path)
            if provider.password_path:
                context["password"] = _resolve_path(item, provider.password_path)
            render(context)
        return result
    if mode == "text":
        if not provider.pattern:
            return result
        pattern = re.compile(provider.pattern)
        for match in pattern.finditer(raw_body):
            # Named groups that did not participate in the match are None.
            render(match.groupdict())
        return result
    return result
|
||||
|
||||
|
||||
async def parse_proxies_with_provider(provider_id: int, raw_body: str) -> List[str]:
    """Load the ``ProxyProvider`` with ``provider_id`` and parse ``raw_body`` with its rule."""
    rule = await ProxyProvider.get(id=provider_id)
    proxies = parse_proxies(raw_body, rule)
    return proxies
|
||||
|
||||
352
app/core/scheduler.py
Normal file
352
app/core/scheduler.py
Normal file
@ -0,0 +1,352 @@
|
||||
import asyncio
|
||||
import json
|
||||
import uuid
|
||||
from datetime import datetime, timedelta
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from apscheduler.schedulers.asyncio import AsyncIOScheduler
|
||||
from apscheduler.triggers.cron import CronTrigger
|
||||
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from app.core.locks import DistributedLock
|
||||
from app.log import logger
|
||||
from app.settings.config import settings
|
||||
from app.models.metrics import ScheduledTaskRun, StatsTotal
|
||||
|
||||
|
||||
# Module-level scheduler instance: assigned by start_scheduler() and reset to None by shutdown_scheduler().
scheduler: AsyncIOScheduler | None = None
|
||||
|
||||
|
||||
async def _record_task_run(task_id: str, task_name: str, status: str, started_at: datetime, error: str | None = None):
    """Store one ScheduledTaskRun row for a job execution that just ended.

    The end time is "now" as returned by ``datetime.now()``; the duration is
    the difference between the two timestamps truncated to whole
    milliseconds. A missing or empty ``error`` is stored as ``""``.
    """
    ended_at = datetime.now()
    elapsed_ms = int((ended_at.timestamp() - started_at.timestamp()) * 1000)
    row = {
        "task_id": task_id,
        "task_name": task_name,
        "status": status,
        "started_at": started_at,
        "finished_at": ended_at,
        "duration_ms": elapsed_ms,
        "error": error or "",
    }
    await ScheduledTaskRun.create(**row)
|
||||
|
||||
|
||||
async def stats_job():
    """Count rows in the ClickHouse job/company tables and report the figures.

    Runs under the ``stats_job`` distributed lock (TTL 600 s) and returns
    immediately when the lock is not acquired. For every table the total row
    count is stored as a StatsTotal row, and the number of rows created today
    is computed as well. The combined result is posted to the report endpoint
    and e-mailed, and the run is recorded as "success" or "fail".
    """
    run_id = str(uuid.uuid4())
    began = datetime.now()
    job_name = "stats_job"

    async with DistributedLock(name=job_name, ttl_seconds=600).context() as got_lock:
        if not got_lock:
            logger.info("stats_job skipped: lock not acquired")
            return
        try:
            ch = await clickhouse_manager.get_client()

            async def count(sql: str) -> int:
                # First column of the first row, or 0 when the query returned nothing.
                reply = await ch.query(sql)
                return int(reply.result_rows[0][0]) if reply.result_rows else 0

            today_filter = (
                "WHERE created_at >= toStartOfDay(now()) AND created_at < toStartOfDay(now()) + INTERVAL 1 DAY"
            )
            # (source platform, data type, ClickHouse table name)
            targets = (
                ("boss", "job", "boss_job"),
                ("qcwy", "job", "qcwy_job"),
                ("zhilian", "job", "zhilian_job"),
                ("boss", "company", "boss_company"),
                ("qcwy", "company", "qcwy_company"),
                ("zhilian", "company", "zhilian_company"),
            )
            summary: list[dict] = []
            for source, data_type, table in targets:
                total = await count(f"SELECT COUNT() AS cnt FROM job_data.{table}")
                await StatsTotal.create(source=source, table_type=data_type, count=total, ts=datetime.now())
                created_today = await count(f"SELECT COUNT() AS cnt FROM job_data.{table} {today_filter}")
                summary.append(
                    {
                        "source": source,
                        "type": data_type,
                        "table": table,
                        "total": total,
                        "daily_new": created_today,
                    }
                )

            report = {"task_id": run_id, "ts": datetime.now().isoformat(), "totals": summary}
            await _post_with_retry(json.dumps(report))
            await _send_email("6小时数据统计", report)
            await _record_task_run(run_id, job_name, "success", began)
        except Exception as exc:
            logger.error(f"stats_job failed: {exc}")
            await _record_task_run(run_id, job_name, "fail", began, error=str(exc))
|
||||
|
||||
|
||||
async def ip_alert_job():
    """Flag IPs that have not reported within the alert window and send an alert.

    Runs under the ``ip_alert_job`` distributed lock (TTL 300 s) and returns
    immediately when the lock is not acquired. The window length comes from
    ``settings.ALERT_WINDOW_MINUTES`` (default 10). Only IpUploadStats rows
    whose ``date`` equals the current UTC date are inspected. A row counts as
    stale when it has no ``last_report_at`` or when that time is older than
    the window; stale rows are switched to status "abnormal" (if not already)
    and included in the alert payload.
    """
    from datetime import timezone

    from app.models.metrics import IpUploadStats  # imported lazily to avoid a circular import

    run_id = str(uuid.uuid4())
    began = datetime.now()
    job_name = "ip_alert_job"

    async with DistributedLock(name=job_name, ttl_seconds=300).context() as got_lock:
        if not got_lock:
            logger.info("ip_alert_job skipped: lock not acquired")
            return
        try:
            window_minutes = getattr(settings, "ALERT_WINDOW_MINUTES", 10)
            # Work with timezone-aware UTC values so comparisons with stored datetimes do not fail.
            utc_now = datetime.now(timezone.utc)
            stale_before = utc_now - timedelta(minutes=window_minutes)
            rows = await IpUploadStats.filter(date=utc_now.date()).all()

            stale: list[dict] = []
            for row in rows:
                seen = getattr(row, "last_report_at", None)
                # Naive values are assumed to be stored in UTC and are tagged accordingly.
                if seen is not None and seen.tzinfo is None:
                    seen = seen.replace(tzinfo=timezone.utc)
                if seen is not None and seen >= stale_before:
                    continue
                if getattr(row, "status", "normal") != "abnormal":
                    row.status = "abnormal"
                    await row.save(update_fields=["status"])
                stale.append(
                    {
                        "source": row.source,
                        "ip": row.ip,
                        "last_report_at": seen.isoformat() if seen else None,
                        "window_minutes": window_minutes,
                    }
                )

            if stale:
                alert = {"task_id": run_id, "ts": datetime.now().isoformat(), "anomalies": stale}
                await _post_with_retry(json.dumps(alert))
                await _send_email("IP上报异常告警", alert)

            elapsed = (datetime.now() - began).total_seconds()
            logger.info(f"ip_alert_job completed in {elapsed:.2f} seconds")
            await _record_task_run(run_id, job_name, "success", began)
        except Exception as exc:
            logger.error(f"ip_alert_job failed: {exc}")
            await _record_task_run(run_id, job_name, "fail", began, error=str(exc))
|
||||
|
||||
|
||||
async def ecs_full_pipeline_job():
    """Run ``ecs_full_pipeline.py`` as a child process and record the outcome.

    Runs under the ``ecs_full_pipeline`` distributed lock (TTL 1800 s) and
    returns immediately when the lock is not acquired. The script is looked up
    two directory levels above this file and executed with the current Python
    interpreter in a worker thread, so the event loop is not blocked. Both
    stdout and stderr of the child are appended to ``ecs_full_pipeline.log``
    next to the script. A zero exit code is recorded as "success"; any other
    exit code, or an exception while launching, is recorded as "fail".
    """
    task_id = str(uuid.uuid4())
    started_at = datetime.now()
    task_name = "ecs_full_pipeline"
    lock = DistributedLock(name=task_name, ttl_seconds=1800)
    async with lock.context() as acquired:
        if not acquired:
            logger.info("ecs_full_pipeline skipped: lock not acquired")
            return
        try:
            root = Path(__file__).resolve().parents[2]
            script = root / "ecs_full_pipeline.py"
            log = root / "ecs_full_pipeline.log"
            with open(log, "a", encoding="utf-8") as f:
                f.write(f"\n[定时] 开始执行 pipeline:{started_at.isoformat()}\n")
                # The child writes straight to the file descriptor, bypassing
                # Python's buffer. Flush first so the start marker precedes the
                # child's output instead of appearing after it on close.
                f.flush()
                proc = await asyncio.to_thread(
                    subprocess.run,
                    [sys.executable, "-u", str(script)],
                    stdout=f,
                    stderr=f,
                    text=True,
                )
            status = "success" if proc.returncode == 0 else "fail"
            await _record_task_run(
                task_id,
                task_name,
                status,
                started_at,
                None if status == "success" else f"rc={proc.returncode}",
            )
        except Exception as e:
            logger.error(f"ecs_full_pipeline failed: {e}")
            await _record_task_run(task_id, task_name, "fail", started_at, error=str(e))
|
||||
|
||||
|
||||
async def company_cleaning_job():
    """Collect and then process a small batch of pending company records.

    Runs under the ``company_cleaning_job`` distributed lock (TTL 300 s) and
    returns immediately when the lock is not acquired. Each run first asks
    the company cleaner to collect up to 50 pending companies and then to
    process up to 30 of them with ``max_delay_seconds=1``. The outcome is
    recorded as "success" or "fail".
    """
    from app.services.company_cleaner import company_cleaner

    run_id = str(uuid.uuid4())
    began = datetime.now()
    job_name = "company_cleaning_job"

    # The lock TTL is kept short because the job is scheduled frequently.
    async with DistributedLock(name=job_name, ttl_seconds=300).context() as got_lock:
        if not got_lock:
            logger.info("company_cleaning_job skipped: lock not acquired")
            return

        try:
            logger.info("Running automated company cleaning job...")

            # Step 1: collect newly seen companies. The batch is deliberately
            # small so that the whole run fits into the scheduling interval.
            await company_cleaner.collect_pending_companies(limit=50)

            # Step 2: process pending companies with a small delay between
            # requests. The original author sized this batch (30 companies at
            # an estimated few seconds each) to leave headroom for step 1.
            await company_cleaner.process_pending_companies(limit=30, max_delay_seconds=1)

            elapsed = (datetime.now() - began).total_seconds()
            logger.info(f"company_cleaning_job completed in {elapsed:.2f} seconds")
            await _record_task_run(run_id, job_name, "success", began)
        except Exception as exc:
            logger.error(f"company_cleaning_job failed: {exc}")
            await _record_task_run(run_id, job_name, "fail", began, error=str(exc))
|
||||
|
||||
|
||||
async def daily_cleanup_job():
    """Remove old records through the company cleaner and record the outcome.

    Runs under the ``daily_cleanup_job`` distributed lock (TTL 3600 s). When
    the lock is not acquired the run is skipped and, like the other jobs in
    this module, the skip is logged. Otherwise
    ``company_cleaner.cleanup_old_records()`` is awaited and the run is
    recorded as "success" or "fail".
    """
    from app.services.company_cleaner import company_cleaner

    task_id = str(uuid.uuid4())
    started_at = datetime.now()
    task_name = "daily_cleanup_job"
    lock = DistributedLock(name=task_name, ttl_seconds=3600)

    async with lock.context() as acquired:
        if not acquired:
            # Previously this job was the only one that skipped silently.
            logger.info("daily_cleanup_job skipped: lock not acquired")
            return

        try:
            logger.info("Running daily cleanup job...")
            await company_cleaner.cleanup_old_records()
            await _record_task_run(task_id, task_name, "success", started_at)
        except Exception as e:
            logger.error(f"daily_cleanup_job failed: {e}")
            await _record_task_run(task_id, task_name, "fail", started_at, error=str(e))
|
||||
|
||||
|
||||
async def _post_with_retry(body: str):
    """POST a JSON string to the configured report endpoint, retrying on failure.

    Settings read (with defaults): ``REPORT_ENDPOINT`` (""),
    ``REPORT_MAX_RETRIES`` (3) and ``REPORT_TIMEOUT`` (10). When no endpoint
    is configured the call is a no-op apart from a warning. Any 2xx response
    ends the call; every other response or exception counts as a failed
    attempt. Between attempts the coroutine sleeps ``min(5 * attempt, 15)``
    seconds.

    The function never raises for delivery failures: once all attempts are
    used up it logs an error and returns ``None``.
    """
    import httpx

    endpoint = getattr(settings, "REPORT_ENDPOINT", "")
    if not endpoint:
        logger.warning("REPORT_ENDPOINT not configured; skip reporting")
        return
    max_retries = getattr(settings, "REPORT_MAX_RETRIES", 3)
    timeout = getattr(settings, "REPORT_TIMEOUT", 10)
    async with httpx.AsyncClient(timeout=timeout) as client:
        for attempt in range(1, max_retries + 1):
            try:
                resp = await client.post(endpoint, headers={"Content-Type": "application/json"}, content=body)
                if 200 <= resp.status_code < 300:
                    return
                raise RuntimeError(f"status={resp.status_code} body={resp.text}")
            except Exception as e:
                logger.warning(f"report attempt {attempt} failed: {e}")
                # Back off only when another attempt follows; sleeping after
                # the last one would just delay the caller for nothing.
                if attempt < max_retries:
                    await asyncio.sleep(min(5 * attempt, 15))
    logger.error(f"report failed after {max_retries} attempts; giving up")
|
||||
|
||||
|
||||
def _build_email_html(subject: str, payload: dict) -> str:
|
||||
"""构建HTML邮件内容"""
|
||||
ts = payload.get("ts") or datetime.now().isoformat()
|
||||
style = (
|
||||
"body{font-family:Arial,Helvetica,sans-serif;background:#f7f7f9;color:#333;}"
|
||||
"h1{font-size:20px;margin:0 0 10px;}"
|
||||
"p.meta{color:#666;font-size:12px;margin:0 0 16px;}"
|
||||
"table{border-collapse:collapse;width:100%;background:#fff;border:1px solid #e5e7eb;}"
|
||||
"th,td{border:1px solid #e5e7eb;padding:8px;text-align:left;font-size:13px;}"
|
||||
"th{background:#f3f4f6;}"
|
||||
".section{margin-top:18px;}"
|
||||
".badge{display:inline-block;background:#2563eb;color:#fff;border-radius:12px;padding:2px 8px;font-size:12px;margin-left:8px;}"
|
||||
)
|
||||
html_head = f"<h1>{subject}<span class=\"badge\">{ts}</span></h1><p class=\"meta\">自动统计与通知</p>"
|
||||
if "totals" in payload:
|
||||
rows = "".join(
|
||||
f"<tr><td>{r.get('source')}</td><td>{r.get('type')}</td><td>{r.get('table')}</td><td>{r.get('total')}</td><td>{r.get('daily_new')}</td></tr>"
|
||||
for r in payload.get("totals", [])
|
||||
)
|
||||
table = f"<table><thead><tr><th>来源</th><th>类型</th><th>表名</th><th>总量</th><th>今日新增</th></tr></thead><tbody>{rows}</tbody></table>"
|
||||
return f"<html><head><meta charset='utf-8'><style>{style}</style></head><body>{html_head}{table}</body></html>"
|
||||
if "anomalies" in payload:
|
||||
rows = "".join(
|
||||
f"<tr><td>{a.get('source')}</td><td>{a.get('ip')}</td><td>{a.get('date')}</td></tr>" for a in payload.get("anomalies", [])
|
||||
)
|
||||
table = f"<table><thead><tr><th>来源</th><th>IP</th><th>日期</th></tr></thead><tbody>{rows}</tbody></table>"
|
||||
return f"<html><head><meta charset='utf-8'><style>{style}</style></head><body>{html_head}{table}</body></html>"
|
||||
body = json.dumps(payload, ensure_ascii=False, indent=2)
|
||||
pre = f"<pre style='background:#111827;color:#e5e7eb;padding:12px;border-radius:6px;overflow:auto;font-size:12px;'>{body}</pre>"
|
||||
return f"<html><head><meta charset='utf-8'><style>{style}</style></head><body>{html_head}{pre}</body></html>"
|
||||
|
||||
|
||||
async def _send_email(subject: str, payload: dict):
    """Send *payload* as an HTML e-mail through the configured SMTP server.

    Settings read (with defaults): ``SMTP_HOST``, ``SMTP_USER``,
    ``SMTP_PASS`` (all ""), ``SMTP_FROM`` (the user), ``SMTP_TO`` (a built-in
    fallback recipient), ``SMTP_PORT`` (587) and ``SMTP_TIMEOUT`` (30
    seconds). ``SMTP_TO`` may be a list or a comma-separated string. When
    host, user or password is missing nothing is sent and a warning is
    logged.

    Sending is best effort: failures are logged and never raised. The
    blocking SMTP conversation runs in a worker thread so that the event loop
    is not stalled, and the connection is closed even when a step fails.
    """
    import smtplib
    from email.mime.text import MIMEText
    from email.utils import formataddr

    host = getattr(settings, "SMTP_HOST", "")
    user = getattr(settings, "SMTP_USER", "")
    password = getattr(settings, "SMTP_PASS", "")
    sender = getattr(settings, "SMTP_FROM", user)
    recipients = getattr(settings, "SMTP_TO", ["zfc9393@163.com"]) or ["zfc9393@163.com"]
    if isinstance(recipients, str):
        # Joining a plain string would split it into single characters.
        recipients = [addr.strip() for addr in recipients.split(",") if addr.strip()] or ["zfc9393@163.com"]
    if not host or not user or not password:
        logger.warning("SMTP not configured; skip email sending")
        return

    html = _build_email_html(subject, payload)
    msg = MIMEText(html, "html", "utf-8")
    msg["Subject"] = subject
    msg["From"] = formataddr(("JobData", sender))
    msg["To"] = ", ".join(recipients)

    port = getattr(settings, "SMTP_PORT", 587)
    timeout = getattr(settings, "SMTP_TIMEOUT", 30)

    def _deliver() -> None:
        # The context manager issues QUIT and closes the socket on every exit path.
        with smtplib.SMTP(host, port, timeout=timeout) as server:
            server.starttls()
            server.login(user, password)
            server.sendmail(sender, recipients, msg.as_string())

    try:
        await asyncio.to_thread(_deliver)
    except Exception as e:
        logger.error(f"email send failed: {e}")
|
||||
|
||||
|
||||
def start_scheduler():
    """Create the global scheduler, register every periodic job and start it.

    Calling this again while a scheduler already exists does nothing.

    Job defaults: ``coalesce=True``, ``max_instances=3`` and a
    ``misfire_grace_time`` of 600 seconds. The jobs registered here each take
    a DistributedLock and return early when it is not acquired, so
    overlapping instances do not both do the work.
    """
    global scheduler
    if scheduler is not None:
        return

    scheduler = AsyncIOScheduler(
        job_defaults={
            "coalesce": True,
            "max_instances": 3,
            "misfire_grace_time": 600,
        }
    )

    # (callable, job id, trigger, extra add_job options), registered in this order.
    registrations = (
        # Every 6 hours: table statistics.
        (stats_job, "stats_job", CronTrigger(second=0, minute=0, hour="*/6"), {}),
        # Every 6 hours: full ECS pipeline.
        (ecs_full_pipeline_job, "ecs_full_pipeline", CronTrigger(second=0, minute=0, hour="*/6"), {}),
        # Every 10 minutes: IP report alerting.
        (ip_alert_job, "ip_alert_job", CronTrigger(second=0, minute="*/10"), {}),
        # Every 5 minutes: company cleaning; repeats the default max_instances explicitly.
        (company_cleaning_job, "company_cleaning_job", CronTrigger(second=0, minute="*/5"), {"max_instances": 3}),
        # Daily at 00:05: cleanup of old records.
        (daily_cleanup_job, "daily_cleanup_job", CronTrigger(second=0, minute=5, hour=0), {}),
    )
    for func, job_id, trigger, options in registrations:
        scheduler.add_job(func, trigger, id=job_id, replace_existing=True, **options)

    scheduler.start()
|
||||
|
||||
|
||||
def shutdown_scheduler():
    """Shut the global scheduler down (``wait=False``) and clear the reference.

    Does nothing when no scheduler is currently set.
    """
    global scheduler
    if scheduler is None:
        return
    scheduler.shutdown(wait=False)
    scheduler = None
|
||||
@ -0,0 +1,33 @@
|
||||
# ALIGNMENT — qcwy_signature_refactor
|
||||
|
||||
阶段: Align(对齐阶段)
|
||||
|
||||
任务名称: 前程无忧 qcwy_api 签名函数提取与爬虫代码重构(风格对齐 boss_api.py)
|
||||
|
||||
1. 项目上下文分析
|
||||
- 技术栈: Python, httpx/requests, FastAPI(服务端),前端 Vue3(web)
|
||||
- 相关文件: new2025/boss/boss_api.py(风格参考)、new2025/qcwy/qcwy_api.py(待重构)
|
||||
- 现有模式: qcwy_api.py 内部包含 ProxyConfig/ProxyManager/RemoteReporter/SignatureGenerator/JobCrawler 等类,使用 httpx.Client 进行请求;boss_api.py 使用 requests.Session 并提供统一日志/调试输出风格。
|
||||
- 业务域: 爬取 51job 小程序接口的职位数据,并将职位与公司详情按约定上报至后端 universal data 接口。
|
||||
|
||||
2. 原始需求与边界确认
|
||||
- 需求: 在 qcwy_api.py 中“提取爬虫的签名函数”,并进行“qcwy_api 重构”,使“代码风格和 boss_api 一致”,且“只需要替换爬虫相关的代码”。
|
||||
- 边界: 不改动非爬虫相关的逻辑(如数据结构、业务上报接口契约),尽量避免引入新的外部依赖;维持 httpx 客户端使用与现有方法签名一致性。
|
||||
|
||||
3. 需求理解
|
||||
- 提取签名函数: 将签名生成逻辑抽到独立函数 generate_qcwy_signature,支持 url_path + 请求体签名;布尔值需转 "true"/"false";HMAC-SHA256 hexdigest。
|
||||
- 提取 property 构建函数: 独立函数 build_qcwy_property,输出 URL 编码的 JSON;JobCrawler.build_property 调用该函数。
|
||||
- 风格对齐: 顶部打印 [DEBUG] 模块加载提示;logging.basicConfig 增加 StreamHandler;函数注释采用 boss_api.py 风格。
|
||||
|
||||
4. 智能决策策略
|
||||
- 参考 boss_api.py 的结构与日志风格,不强行将 httpx 改成 requests,以减少改动面与风险;仅在签名与 property 构建、日志风格上进行统一。
|
||||
- 复用现有接口调用流程,确保对外契约不变。
|
||||
|
||||
5. 疑问澄清(需确认)
|
||||
- 是否需要将 httpx.Client 改为 requests.Session 以进一步风格一致?当前按“只替换爬虫相关代码”理解为保持 httpx。
|
||||
- 签名 key 是否固定由调用方维护?目前保留原始默认值并通过依赖注入传入。
|
||||
|
||||
6. 验收标准
|
||||
- qcwy_api.py 中存在可复用的 generate_qcwy_signature 与 build_qcwy_property 函数,并在 JobCrawler 内部调用。
|
||||
- 日志与 debug 输出风格与 boss_api.py 保持一致。
|
||||
- 现有功能(如 get_recommend_jobs)保持原有行为,不破坏接口契约与上报逻辑。
|
||||
@ -0,0 +1,25 @@
|
||||
# CONSENSUS — qcwy_signature_refactor
|
||||
|
||||
阶段: Align 最终共识
|
||||
|
||||
明确的需求与验收标准
|
||||
- 需求: 提取 51job 爬虫的签名函数与 property 构建函数为独立方法,并在 JobCrawler 内复用;对齐 boss_api.py 的日志与调试输出风格;仅修改与爬虫相关的代码。
|
||||
- 验收标准:
|
||||
- 存在 generate_qcwy_signature(sign_key, url_path, data) 独立函数,输出与原有签名一致。
|
||||
- 存在 build_qcwy_property(manual_login_method, page_code) 独立函数,并在 JobCrawler.build_property 内复用。
|
||||
- 在模块顶部添加 [DEBUG] 输出与 logging StreamHandler,日志风格一致。
|
||||
- 不影响对外调用契约与数据上报逻辑。
|
||||
|
||||
技术实现方案与约束
|
||||
- 保持 httpx.Client 以减少非爬虫相关的改动;签名与 property 独立函数供各 API 组装复用。
|
||||
- 函数注释与类型标注齐全;布尔值在签名前转换为字符串,确保线上一致性。
|
||||
- 不引入新依赖,严格遵守 .env 管理敏感配置的规范。
|
||||
|
||||
集成方案
|
||||
- JobCrawler 内部继续使用 self.signature_generator.generate_signature 与 self.build_property,但内部转调独立函数,保证现有方法签名不变。
|
||||
|
||||
边界与限制
|
||||
- 不调整代理与上报接口的现有实现;如需进一步统一至 requests.Session,需单独评审并更新 DESIGN/TASK 文档。
|
||||
|
||||
不确定性状态
|
||||
- 当前均已解决;若后续出现兼容性问题,将在 TASK/ACCEPTANCE 文档中记录并处置。
|
||||
@ -0,0 +1,60 @@
|
||||
# DESIGN — qcwy_signature_refactor
|
||||
|
||||
1) 整体架构图
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[JobCrawler 调用层] --> B[签名生成 generate_qcwy_signature]
|
||||
A --> C[property 构建 build_qcwy_property]
|
||||
A --> D[httpx.Client 请求发送]
|
||||
D --> E[51job API]
|
||||
A --> F[RemoteReporter 数据上报]
|
||||
```
|
||||
|
||||
2) 分层设计与核心组件
|
||||
- Crawler 层:JobCrawler(业务编排、参数组装、headers签名与property、调试打印、调用 _make_request)
|
||||
- Signature 层:generate_qcwy_signature(独立函数,HMAC-SHA256签名)+ SignatureGenerator(兼容旧接口)
|
||||
- Property 层:build_qcwy_property(独立函数,URL编码后的JSON)
|
||||
- Transport 层:httpx.Client(保留,会话与代理切换)
|
||||
- Reporter 层:RemoteReporter(上报到后端 universal data)
|
||||
|
||||
3) 模块依赖关系图
|
||||
```mermaid
|
||||
flowchart LR
|
||||
JobCrawler --> SignatureGenerator --> generate_qcwy_signature
|
||||
JobCrawler --> build_qcwy_property
|
||||
JobCrawler --> httpxClient
|
||||
JobCrawler --> RemoteReporter
|
||||
```
|
||||
|
||||
4) 接口契约定义
|
||||
- generate_qcwy_signature(sign_key: str, url_path: str, data: Optional[Dict]) -> str
|
||||
- build_qcwy_property(manual_login_method: str = "", page_code: str = "home|hotjob|jobfxlist") -> str
|
||||
- SignatureGenerator.generate_signature(url_path: str, data: Optional[Dict]) -> str
|
||||
|
||||
5) 数据流向图
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant J as JobCrawler
|
||||
participant S as Signature
|
||||
participant P as Property
|
||||
participant X as httpx.Client
|
||||
participant API as 51job API
|
||||
participant R as RemoteReporter
|
||||
|
||||
J->>S: 生成签名
|
||||
J->>P: 构建 property
|
||||
J->>X: 发送请求(headers含签名与property)
|
||||
X->>API: HTTP 请求
|
||||
API-->>J: 返回 JSON 响应
|
||||
J->>R: 上报数据(批量)
|
||||
```
|
||||
|
||||
6) 异常处理策略
|
||||
- 代理异常:在 _make_request 中检测典型错误码/异常关键字,触发 _switch_proxy_and_reinit。
|
||||
- 网络异常:指数退避重试,最多3次;返回 None 时上层做空值兜底。
|
||||
- 解析异常:捕获并记录日志,返回空集合/空字典,避免中断流水线。
|
||||
|
||||
7) 设计原则
|
||||
- 任务范围内的最小改动:仅替换爬虫相关签名与property构建逻辑。
|
||||
- 保持对外契约不变:保留 JobCrawler 与 Reporter 行为接口。
|
||||
- 与 boss_api 风格对齐:日志配置、调试输出统一。
|
||||
@ -0,0 +1,33 @@
|
||||
# TASK — qcwy_signature_refactor
|
||||
|
||||
1) 原子任务拆分
|
||||
- T1 提取独立签名函数 generate_qcwy_signature(输入:sign_key/url_path/data;输出:签名字符串;约束:HMAC-SHA256)
|
||||
- T2 提取独立 property 构建函数 build_qcwy_property(输入:manual_login_method/page_code;输出:URL编码字符串)
|
||||
- T3 JobCrawler 复用新函数并统一调试输出(输入:URL/headers/body;输出:HTTP响应)
|
||||
- T4 扩充函数级注释(方法与参数/返回说明),满足代码输出标准
|
||||
- T5 编写单元测试:签名一致性/布尔值转换/属性编码有效性
|
||||
- T6 文档与进度同步:ALIGNMENT/CONSENSUS/DESIGN/TASK/ACCEPTANCE/FINAL/TODO
|
||||
|
||||
2) 依赖关系
|
||||
```mermaid
|
||||
flowchart TD
|
||||
T1 --> T3
|
||||
T2 --> T3
|
||||
T3 --> T4
|
||||
T3 --> T5
|
||||
T4 --> T6
|
||||
T5 --> T6
|
||||
```
|
||||
|
||||
3) 任务详细约束
|
||||
- 输入契约:
|
||||
- T1: sign_key, url_path, data
|
||||
- T2: manual_login_method,page_code
|
||||
- T3: 复用T1/T2后组装请求
|
||||
- 输出契约:
|
||||
- T1/T2: 字符串且可用于HTTP headers
|
||||
- T3: 成功返回JSON字典或空值兜底
|
||||
- 实现约束:
|
||||
- 与 boss_api 风格统一;不变更对外接口;不引入新依赖
|
||||
- 验收标准:
|
||||
- 单元测试通过;日志与调试输出统一;文档完整
|
||||
297
app/docs/universal_data_api_usage.md
Normal file
297
app/docs/universal_data_api_usage.md
Normal file
@ -0,0 +1,297 @@
|
||||
# 通用数据路由API使用指南
|
||||
|
||||
## 概述
|
||||
|
||||
通用数据路由系统提供了一个统一的API接口,可以根据数据类型和平台自动将数据路由到对应的ClickHouse表中进行存储。这个系统支持所有主要的招聘平台(Boss直聘、前程无忧、智联招聘)和数据类型(职位、公司)。
|
||||
|
||||
## 核心特性
|
||||
|
||||
- **统一接口**: 所有平台的数据都通过相同的API接口处理
|
||||
- **自动路由**: 根据数据类型和平台自动选择对应的存储表
|
||||
- **重复检查**: 支持自动重复数据检查,避免数据冗余
|
||||
- **批量处理**: 支持单条和批量数据处理
|
||||
- **异步处理**: 支持异步后台任务处理大量数据
|
||||
- **数据预处理**: 自动进行字段映射和数据转换
|
||||
|
||||
## API端点
|
||||
|
||||
### 基础路径
|
||||
```
|
||||
/api/v1/universal
|
||||
```
|
||||
|
||||
### 主要端点
|
||||
|
||||
1. **存储单条数据**: `POST /data/store`
|
||||
2. **批量存储数据**: `POST /data/batch-store`
|
||||
3. **异步存储单条数据**: `POST /data/store-async`
|
||||
4. **异步批量存储数据**: `POST /data/batch-store-async`
|
||||
5. **获取支持的平台**: `GET /platforms`
|
||||
|
||||
### 平台特定便捷端点
|
||||
|
||||
- `POST /boss/job` - Boss直聘职位数据
|
||||
- `POST /boss/company` - Boss直聘公司数据
|
||||
- `POST /qcwy/job` - 前程无忧职位数据
|
||||
- `POST /qcwy/company` - 前程无忧公司数据
|
||||
- `POST /zhilian/job` - 智联招聘职位数据
|
||||
- `POST /zhilian/company` - 智联招聘公司数据
|
||||
|
||||
## 数据类型和平台
|
||||
|
||||
### 支持的平台
|
||||
- `boss` - Boss直聘
|
||||
- `qcwy` - 前程无忧
|
||||
- `zhilian` - 智联招聘
|
||||
|
||||
### 支持的数据类型
|
||||
- `job` - 职位数据
|
||||
- `company` - 公司数据
|
||||
|
||||
### 默认重复检查字段
|
||||
|
||||
| 平台 | 职位数据 | 公司数据 |
|
||||
|------|----------|----------|
|
||||
| boss | encrypt_job_id | encrypt_id |
|
||||
| qcwy | job_id | company_id |
|
||||
| zhilian | job_id | company_id |
|
||||
|
||||
## 使用示例
|
||||
|
||||
### 1. 存储Boss直聘职位数据
|
||||
|
||||
```http
|
||||
POST /api/v1/universal/data/store
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"data": {
|
||||
"securityId": "abc123",
|
||||
"jobBaseInfoVO": {
|
||||
"encryptJobId": "job_encrypt_123",
|
||||
"jobId": 12345,
|
||||
"positionName": "Python开发工程师",
|
||||
"locationName": "北京",
|
||||
"lowSalary": 15000,
|
||||
"highSalary": 25000,
|
||||
"jobDesc": "负责后端开发工作..."
|
||||
},
|
||||
"bossBaseInfoVO": {
|
||||
"encryptBossId": "boss_encrypt_456",
|
||||
"bossId": 67890,
|
||||
"bossName": "张经理"
|
||||
},
|
||||
"brandComInfoVO": {
|
||||
"brandName": "某科技公司",
|
||||
"industryName": "互联网",
|
||||
"scaleName": "100-499人"
|
||||
}
|
||||
},
|
||||
"data_type": "job",
|
||||
"platform": "boss",
|
||||
"check_duplicate": true,
|
||||
"duplicate_key": "encrypt_job_id"
|
||||
}
|
||||
```
|
||||
|
||||
### 2. 批量存储前程无忧职位数据
|
||||
|
||||
```http
|
||||
POST /api/v1/universal/data/batch-store
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"data_list": [
|
||||
{
|
||||
"job_name": "Java开发工程师",
|
||||
"brand_name": "某互联网公司",
|
||||
"job_addr": "上海",
|
||||
"salary_desc": "15k-25k",
|
||||
"job_desc": "负责Java后端开发..."
|
||||
},
|
||||
{
|
||||
"job_name": "前端开发工程师",
|
||||
"brand_name": "某科技公司",
|
||||
"job_addr": "深圳",
|
||||
"salary_desc": "12k-20k",
|
||||
"job_desc": "负责前端页面开发..."
|
||||
}
|
||||
],
|
||||
"data_type": "job",
|
||||
"platform": "qcwy",
|
||||
"check_duplicate": true
|
||||
}
|
||||
```
|
||||
|
||||
### 3. 使用便捷端点存储数据
|
||||
|
||||
```http
|
||||
POST /api/v1/universal/boss/job
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"securityId": "abc123",
|
||||
"jobBaseInfoVO": {
|
||||
"encryptJobId": "job_encrypt_789",
|
||||
"positionName": "产品经理",
|
||||
"locationName": "杭州"
|
||||
},
|
||||
"brandComInfoVO": {
|
||||
"brandName": "某电商公司",
|
||||
"industryName": "电子商务"
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### 4. 异步批量处理大量数据
|
||||
|
||||
```http
|
||||
POST /api/v1/universal/data/batch-store-async
|
||||
Content-Type: application/json
|
||||
|
||||
{
|
||||
"data_list": [
|
||||
// ... 大量数据
|
||||
],
|
||||
"data_type": "job",
|
||||
"platform": "zhilian",
|
||||
"check_duplicate": true
|
||||
}
|
||||
```
|
||||
|
||||
## 响应格式
|
||||
|
||||
### 成功响应
|
||||
|
||||
```json
|
||||
{
|
||||
"code": 200,
|
||||
"message": "数据存储成功",
|
||||
"data": {
|
||||
"success": true,
|
||||
"message": "数据存储成功",
|
||||
"duplicate": false,
|
||||
"result": true
|
||||
},
|
||||
"platform": "boss",
|
||||
"data_type": "job"
|
||||
}
|
||||
```
|
||||
|
||||
### 重复数据响应
|
||||
|
||||
```json
|
||||
{
|
||||
"code": 400,
|
||||
"message": "数据已存在",
|
||||
"data": {
|
||||
"success": false,
|
||||
"message": "数据已存在",
|
||||
"duplicate": true,
|
||||
"existing_id": 12345
|
||||
},
|
||||
"platform": "boss",
|
||||
"data_type": "job"
|
||||
}
|
||||
```
|
||||
|
||||
### 批量处理响应
|
||||
|
||||
```json
|
||||
{
|
||||
"code": 200,
|
||||
"message": "批量处理完成: 成功 8 条,失败 1 条,重复 1 条",
|
||||
"data": {
|
||||
"total": 10,
|
||||
"success": 8,
|
||||
"failed": 1,
|
||||
"duplicate": 1,
|
||||
"errors": [
|
||||
{
|
||||
"index": 5,
|
||||
"error": "缺少必要字段"
|
||||
}
|
||||
]
|
||||
},
|
||||
"platform": "qcwy",
|
||||
"data_type": "job"
|
||||
}
|
||||
```
|
||||
|
||||
### 异步处理响应
|
||||
|
||||
```json
|
||||
{
|
||||
"code": 202,
|
||||
"message": "数据已加入异步处理队列",
|
||||
"platform": "zhilian",
|
||||
"data_type": "job"
|
||||
}
|
||||
```
|
||||
|
||||
## 数据预处理
|
||||
|
||||
系统会根据不同平台自动进行数据预处理:
|
||||
|
||||
### Boss直聘数据预处理
|
||||
- 从嵌套的VO对象中提取字段
|
||||
- 映射到标准的数据库字段
|
||||
- 保留原始数据在`raw_data`字段中
|
||||
|
||||
### 前程无忧数据预处理
|
||||
- 直接映射字段名
|
||||
- 处理数组类型字段
|
||||
- 标准化日期格式
|
||||
|
||||
### 智联招聘数据预处理
|
||||
- 处理复杂的嵌套结构
|
||||
- 提取关键信息字段
|
||||
- 转换数据类型
|
||||
|
||||
## 错误处理
|
||||
|
||||
### 常见错误码
|
||||
|
||||
- `400` - 请求参数错误或数据已存在
|
||||
- `500` - 服务器内部错误
|
||||
- `202` - 异步处理已接受(非错误:表示请求已进入后台队列,稍后处理)
|
||||
|
||||
### 错误响应示例
|
||||
|
||||
```json
|
||||
{
|
||||
"detail": "数据存储失败: 缺少必要字段 'encrypt_job_id'"
|
||||
}
|
||||
```
|
||||
|
||||
## 最佳实践
|
||||
|
||||
1. **使用重复检查**: 始终启用重复检查以避免数据冗余
|
||||
2. **批量处理**: 对于大量数据,使用批量接口提高效率
|
||||
3. **异步处理**: 对于非实时需求,使用异步接口避免超时
|
||||
4. **错误处理**: 妥善处理API返回的错误信息
|
||||
5. **数据验证**: 在发送前验证数据的完整性
|
||||
|
||||
## 监控和日志
|
||||
|
||||
系统会自动记录以下信息:
|
||||
- 数据处理成功/失败统计
|
||||
- 重复数据检测结果
|
||||
- 处理时间和性能指标
|
||||
- 错误详情和堆栈信息
|
||||
|
||||
可以通过应用日志查看详细的处理信息。
|
||||
|
||||
## 迁移指南
|
||||
|
||||
### 从平台特定API迁移
|
||||
|
||||
如果你之前使用平台特定的API(如`/api/v1/boss/job`),可以:
|
||||
|
||||
1. **继续使用便捷端点**: 使用`/api/v1/universal/boss/job`等便捷端点
|
||||
2. **迁移到通用端点**: 使用`/api/v1/universal/data/store`通用端点
|
||||
3. **逐步迁移**: 先测试新接口,确认无误后再完全迁移
|
||||
|
||||
### 数据格式兼容性
|
||||
|
||||
新的通用API完全兼容现有的数据格式,无需修改数据结构。
|
||||
1
app/log/__init__.py
Normal file
1
app/log/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
from .log import logger as logger
|
||||
25
app/log/log.py
Normal file
25
app/log/log.py
Normal file
@ -0,0 +1,25 @@
|
||||
import sys
|
||||
|
||||
from loguru import logger as loguru_logger
|
||||
|
||||
from app.settings import settings
|
||||
|
||||
|
||||
class Loggin:
    """Sets up the shared loguru logger used by the application."""

    def __init__(self) -> None:
        # DEBUG verbosity when settings.DEBUG is truthy, INFO otherwise.
        self.level = "DEBUG" if settings.DEBUG else "INFO"

    def setup_logger(self):
        """Replace loguru's current handlers with one stdout sink and return the logger."""
        loguru_logger.remove()
        loguru_logger.add(sink=sys.stdout, level=self.level)
        # A file sink (for example with size-based rotation) could be added here too.
        return loguru_logger


# Configured once at import time; other modules import ``logger`` from here.
loggin = Loggin()
logger = loggin.setup_logger()
|
||||
5
app/models/__init__.py
Normal file
5
app/models/__init__.py
Normal file
@ -0,0 +1,5 @@
|
||||
# 新增model需要在这里导入
|
||||
from .admin import *
|
||||
from .metrics import *
|
||||
from .keyword import *
|
||||
from .cleaning import *
|
||||
89
app/models/admin.py
Normal file
89
app/models/admin.py
Normal file
@ -0,0 +1,89 @@
|
||||
from tortoise import fields
|
||||
|
||||
from app.schemas.menus import MenuType
|
||||
|
||||
from .base import BaseModel, TimestampMixin
|
||||
from .enums import MethodType
|
||||
|
||||
|
||||
class User(BaseModel, TimestampMixin):
    """Application user account, stored in the ``user`` table."""

    # Identity and contact details; username and email are unique.
    username = fields.CharField(max_length=20, unique=True, index=True, description="用户名称")
    alias = fields.CharField(max_length=30, null=True, index=True, description="姓名")
    email = fields.CharField(max_length=255, unique=True, index=True, description="邮箱")
    phone = fields.CharField(max_length=20, null=True, index=True, description="电话")
    # Nullable password column, up to 128 characters.
    password = fields.CharField(max_length=128, null=True, description="密码")
    # Account state flags and last login time.
    is_active = fields.BooleanField(default=True, index=True, description="是否激活")
    is_superuser = fields.BooleanField(default=False, index=True, description="是否为超级管理员")
    last_login = fields.DatetimeField(null=True, index=True, description="最后登录时间")
    # Many-to-many link to Role; the reverse accessor on Role is ``user_roles``.
    roles = fields.ManyToManyField("models.Role", related_name="user_roles")
    # Department id stored as a plain integer column, not a foreign key.
    dept_id = fields.IntField(null=True, index=True, description="部门ID")

    class Meta:
        table = "user"
|
||||
|
||||
|
||||
class Role(BaseModel, TimestampMixin):
    """Role that groups menu and API permissions, stored in the ``role`` table."""

    # Unique role name and an optional free-text description.
    name = fields.CharField(max_length=20, unique=True, index=True, description="角色名称")
    desc = fields.CharField(max_length=500, null=True, description="角色描述")
    # Many-to-many links; reverse accessors are ``role_menus`` and ``role_apis``.
    menus = fields.ManyToManyField("models.Menu", related_name="role_menus")
    apis = fields.ManyToManyField("models.Api", related_name="role_apis")

    class Meta:
        table = "role"
|
||||
|
||||
|
||||
class Api(BaseModel, TimestampMixin):
    """Registered API endpoint (path plus method), stored in the ``api`` table."""

    # Endpoint path and HTTP method (restricted to the MethodType enum).
    path = fields.CharField(max_length=100, index=True, description="API路径")
    method = fields.CharEnumField(MethodType, index=True, description="请求方法")
    # Short summary and tag string for the endpoint.
    summary = fields.CharField(max_length=500, index=True, description="请求简介")
    tags = fields.CharField(max_length=100, index=True, description="API标签")

    class Meta:
        table = "api"
|
||||
|
||||
|
||||
class Menu(BaseModel, TimestampMixin):
    """Navigation menu entry, stored in the ``menu`` table."""

    name = fields.CharField(max_length=20, index=True, description="菜单名称")
    # Free-form JSON column described as a reserved field.
    remark = fields.JSONField(null=True, description="保留字段")
    # Menu kind, restricted to the MenuType enum; may be null.
    menu_type = fields.CharEnumField(MenuType, null=True, description="菜单类型")
    icon = fields.CharField(max_length=100, null=True, description="菜单图标")
    path = fields.CharField(max_length=100, index=True, description="菜单路径")
    # Sort position and parent menu id (both default to 0).
    order = fields.IntField(default=0, index=True, description="排序")
    parent_id = fields.IntField(default=0, index=True, description="父菜单ID")
    is_hidden = fields.BooleanField(default=False, description="是否隐藏")
    # Component name, keep-alive flag and optional redirect target.
    component = fields.CharField(max_length=100, description="组件")
    keepalive = fields.BooleanField(default=True, description="存活")
    redirect = fields.CharField(max_length=100, null=True, description="重定向")

    class Meta:
        table = "menu"
|
||||
|
||||
|
||||
class Dept(BaseModel, TimestampMixin):
    """Department record, stored in the ``dept`` table."""

    # Unique department name and an optional remark.
    name = fields.CharField(max_length=20, unique=True, index=True, description="部门名称")
    desc = fields.CharField(max_length=500, null=True, description="备注")
    # Soft-delete marker: the row stays in the table and is flagged instead.
    is_deleted = fields.BooleanField(default=False, index=True, description="软删除标记")
    order = fields.IntField(default=0, index=True, description="排序")
    # NOTE(review): ``max_length`` is not a documented IntField option and is
    # presumably ignored; confirm before removing it.
    parent_id = fields.IntField(default=0, max_length=10, index=True, description="父部门ID")

    class Meta:
        table = "dept"
|
||||
|
||||
|
||||
class DeptClosure(BaseModel, TimestampMixin):
    """Ancestor/descendant pair between departments, with its depth."""

    # Ids are plain integer columns; presumably they refer to Dept rows (verify against callers).
    ancestor = fields.IntField(index=True, description="父代")
    descendant = fields.IntField(index=True, description="子代")
    # Depth of the relation; defaults to 0.
    level = fields.IntField(default=0, index=True, description="深度")
|
||||
|
||||
|
||||
class AuditLog(BaseModel, TimestampMixin):
    """One audited request: who called what, with which result."""

    # Caller identity.
    user_id = fields.IntField(index=True, description="用户ID")
    username = fields.CharField(max_length=64, default="", index=True, description="用户名称")
    # What was called.
    module = fields.CharField(max_length=64, default="", index=True, description="功能模块")
    summary = fields.CharField(max_length=128, default="", index=True, description="请求描述")
    method = fields.CharField(max_length=10, default="", index=True, description="请求方法")
    path = fields.CharField(max_length=255, default="", index=True, description="请求路径")
    # Outcome: status code (default -1) and response time in milliseconds.
    status = fields.IntField(default=-1, index=True, description="状态码")
    response_time = fields.IntField(default=0, index=True, description="响应时间(单位ms)")
    # Optional JSON snapshots of the request arguments and the response body.
    request_args = fields.JSONField(null=True, description="请求参数")
    response_body = fields.JSONField(null=True, description="返回数据")
|
||||
62
app/models/base.py
Normal file
62
app/models/base.py
Normal file
@ -0,0 +1,62 @@
|
||||
import asyncio
|
||||
from datetime import datetime
|
||||
|
||||
from tortoise import fields, models
|
||||
|
||||
from app.settings import settings
|
||||
|
||||
|
||||
class BaseModel(models.Model):
    # Abstract base for ORM models: big-integer primary key plus dict export.
    id = fields.BigIntField(pk=True, index=True)

    async def to_dict(self, m2m: bool = False, exclude_fields: list[str] | None = None):
        """Export the instance as a plain dict.

        Args:
            m2m: When true, every many-to-many relation not listed in
                ``exclude_fields`` is fetched and added under its field name.
            exclude_fields: Field names to leave out; ``None`` means none.

        Returns:
            Mapping of field name to value. ``datetime`` values are rendered
            with ``settings.DATETIME_FORMAT``.
        """
        skipped = [] if exclude_fields is None else exclude_fields

        payload = {}
        for name in self._meta.db_fields:
            if name in skipped:
                continue
            raw = getattr(self, name)
            payload[name] = raw.strftime(settings.DATETIME_FORMAT) if isinstance(raw, datetime) else raw

        if not m2m:
            return payload

        # Fetch all requested relations concurrently.
        pending = [
            self.__fetch_m2m_field(name, skipped)
            for name in self._meta.m2m_fields
            if name not in skipped
        ]
        for name, rows in await asyncio.gather(*pending):
            payload[name] = rows

        return payload

    async def __fetch_m2m_field(self, field, exclude_fields):
        """Load one relation and return ``(field, rows)``.

        Each row is a dict without the keys in ``exclude_fields``; ``datetime``
        values are rendered with ``settings.DATETIME_FORMAT``.
        """
        related_rows = await getattr(self, field).all().values()

        rendered = []
        for row in related_rows:
            rendered.append(
                {
                    key: (val.strftime(settings.DATETIME_FORMAT) if isinstance(val, datetime) else val)
                    for key, val in row.items()
                    if key not in exclude_fields
                }
            )

        return field, rendered

    class Meta:
        abstract = True
|
||||
|
||||
|
||||
class UUIDModel:
    # Mixin contributing a unique, indexed, non-primary-key UUID column.
    uuid = fields.UUIDField(pk=False, unique=True, index=True)
|
||||
|
||||
|
||||
class TimestampMixin:
    # Mixin contributing indexed creation / modification timestamps.
    # Set automatically on insert (auto_now_add).
    created_at = fields.DatetimeField(index=True, auto_now_add=True)
    # Refreshed automatically on save (auto_now).
    updated_at = fields.DatetimeField(index=True, auto_now=True)
|
||||
45
app/models/cleaning.py
Normal file
45
app/models/cleaning.py
Normal file
@ -0,0 +1,45 @@
|
||||
from tortoise import fields
|
||||
from app.models.base import TimestampMixin, BaseModel
|
||||
|
||||
|
||||
class CleaningTask(BaseModel, TimestampMixin):
    # ORM model for one data-cleaning task, stored in "cleaning_task".
    # Target: URL, company name or id.
    target = fields.CharField(description="目标(URL/公司名/ID)", max_length=255)
    # Cleaning mode, defaults to "auto".
    clean_type = fields.CharField(description="清洗模式", max_length=50, default="auto")
    # Target platform, defaults to "auto".
    platform = fields.CharField(description="目标平台", max_length=50, default="auto")
    # Optional proxy address.
    proxy = fields.CharField(description="代理地址", max_length=255, null=True)
    # Task status: pending / processing / success / fail.
    status = fields.CharField(description="状态: pending/processing/success/fail", max_length=20, default="pending")
    # Storage status: saved / duplicate / failed / unknown.
    storage_status = fields.CharField(
        description="存储状态: saved/duplicate/failed/unknown", max_length=20, default="unknown"
    )
    # Whether the result was pushed to the remote side.
    remote_sent = fields.BooleanField(description="是否已远程推送", default=False)
    # Summary of the cleaning result.
    result_summary = fields.JSONField(description="清洗结果摘要", null=True)
    # Original request data.
    original_data = fields.JSONField(description="原始请求数据", null=True)
    # Error message, if any.
    error_msg = fields.TextField(description="错误信息", null=True)

    class Meta:
        table = "cleaning_task"
|
||||
|
||||
|
||||
class ProxyConfig(BaseModel, TimestampMixin):
    # ORM model for one configured proxy, stored in "proxy_config".
    # Display name.
    name = fields.CharField(description="名称", max_length=100)
    # Proxy type: http / socks / tunnel.
    proxy_type = fields.CharField(description="代理类型: http/socks/tunnel", max_length=20)
    # Target platform: boss / qcwy / zhilian / all (default "all").
    platform = fields.CharField(description="目标平台: boss/qcwy/zhilian/all", max_length=50, default="all")
    # Proxy address.
    proxy_url = fields.CharField(description="代理地址", max_length=255)
    # Whether the proxy is usable, defaults to True.
    is_active = fields.BooleanField(description="是否可用", default=True)

    class Meta:
        table = "proxy_config"
|
||||
|
||||
|
||||
class ProxyProvider(BaseModel, TimestampMixin):
    # ORM model describing how to parse a proxy provider's response,
    # stored in "proxy_provider".
    # Display name.
    name = fields.CharField(description="名称", max_length=100)
    # Target platform: boss / qcwy / zhilian / all (default "all").
    platform = fields.CharField(description="目标平台: boss/qcwy/zhilian/all", max_length=50, default="all")
    # Parse mode: json / text.
    mode = fields.CharField(description="解析模式: json/text", max_length=20)
    # Path of the list inside a JSON response.
    list_path = fields.CharField(description="JSON列表路径", max_length=255, null=True)
    # Path of the IP field.
    ip_path = fields.CharField(description="IP字段路径", max_length=255, null=True)
    # Path of the port field.
    port_path = fields.CharField(description="端口字段路径", max_length=255, null=True)
    # Path of the user-name field.
    username_path = fields.CharField(description="用户名字段路径", max_length=255, null=True)
    # Path of the password field.
    password_path = fields.CharField(description="密码字段路径", max_length=255, null=True)
    # Regular expression used in text mode.
    pattern = fields.TextField(description="文本解析正则", null=True)
    # Template for the final proxy string.
    template = fields.CharField(description="最终代理模板", max_length=255)

    class Meta:
        table = "proxy_provider"
|
||||
19
app/models/enums.py
Normal file
19
app/models/enums.py
Normal file
@ -0,0 +1,19 @@
|
||||
from enum import Enum, StrEnum
|
||||
|
||||
|
||||
class EnumBase(Enum):
|
||||
@classmethod
|
||||
def get_member_values(cls):
|
||||
return [item.value for item in cls._member_map_.values()]
|
||||
|
||||
@classmethod
|
||||
def get_member_names(cls):
|
||||
return [name for name in cls._member_names_]
|
||||
|
||||
|
||||
class MethodType(StrEnum):
|
||||
GET = "GET"
|
||||
POST = "POST"
|
||||
PUT = "PUT"
|
||||
DELETE = "DELETE"
|
||||
PATCH = "PATCH"
|
||||
31
app/models/keyword.py
Normal file
31
app/models/keyword.py
Normal file
@ -0,0 +1,31 @@
|
||||
from tortoise import fields
|
||||
from tortoise.models import Model
|
||||
|
||||
|
||||
class BaseKeyword(Model):
    # Abstract ORM base for a (city, job) search keyword row.
    # Integer primary key.
    id = fields.IntField(pk=True)
    # City name.
    city = fields.CharField(max_length=64)
    # Job keyword.
    job = fields.CharField(max_length=128)
    # Date of the last request, nullable.
    last_requested_date = fields.DateField(null=True)
    # Timestamp of the last request, nullable.
    last_requested_at = fields.DatetimeField(null=True)
    # Set automatically on insert.
    created_at = fields.DatetimeField(auto_now_add=True)
    # Refreshed automatically on save.
    updated_at = fields.DatetimeField(auto_now=True)

    class Meta:
        abstract = True
|
||||
|
||||
|
||||
class BossKeyword(BaseKeyword):
    # Keyword rows stored in the "boss_keyword" table.
    class Meta:
        table = "boss_keyword"
|
||||
|
||||
|
||||
class QcwyKeyword(BaseKeyword):
    # Keyword rows stored in the "qcwy_keyword" table.
    class Meta:
        table = "qcwy_keyword"
|
||||
|
||||
|
||||
class ZhilianKeyword(BaseKeyword):
    # Keyword rows stored in the "zhilian_keyword" table.
    class Meta:
        table = "zhilian_keyword"
|
||||
|
||||
35
app/models/metrics.py
Normal file
35
app/models/metrics.py
Normal file
@ -0,0 +1,35 @@
|
||||
from tortoise import fields
|
||||
from tortoise.models import Model
|
||||
|
||||
|
||||
class StatsTotal(Model):
    # ORM model holding one count sample per (source, table_type, ts).
    # Integer primary key.
    id = fields.IntField(pk=True)
    # Data source label.
    source = fields.CharField(max_length=32)
    # Table type label.
    table_type = fields.CharField(max_length=32)
    # Recorded count.
    count = fields.IntField()
    # Timestamp of the sample.
    ts = fields.DatetimeField()
|
||||
|
||||
|
||||
class IpUploadStats(Model):
    # ORM model counting uploads per (source, ip, date); that triple is unique.
    # Integer primary key.
    id = fields.IntField(pk=True)
    # Data source label.
    source = fields.CharField(max_length=32)
    # Uploader IP address.
    ip = fields.CharField(max_length=64)
    # Day the counter refers to.
    date = fields.DateField()
    # Number of uploads.
    upload_count = fields.IntField()
    # Timestamp of the last report, nullable.
    last_report_at = fields.DatetimeField(null=True)
    # Status label, defaults to "normal".
    status = fields.CharField(default="normal", max_length=16)
    # Set automatically on insert.
    created_at = fields.DatetimeField(auto_now_add=True)

    class Meta:
        unique_together = ("source", "ip", "date")
|
||||
|
||||
|
||||
class ScheduledTaskRun(Model):
    # ORM model recording one execution of a scheduled task.
    # Integer primary key.
    id = fields.IntField(pk=True)
    # Task identifier.
    task_id = fields.CharField(max_length=64)
    # Task name.
    task_name = fields.CharField(max_length=64)
    # Run status label.
    status = fields.CharField(max_length=32)
    # Start timestamp.
    started_at = fields.DatetimeField()
    # Finish timestamp.
    finished_at = fields.DatetimeField()
    # Duration in milliseconds.
    duration_ms = fields.IntField()
    # Error text, nullable.
    error = fields.TextField(null=True)
|
||||
17
app/models/token.py
Normal file
17
app/models/token.py
Normal file
@ -0,0 +1,17 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from tortoise import fields
|
||||
from app.models.base import BaseModel, TimestampMixin
|
||||
|
||||
|
||||
class BossToken(BaseModel, TimestampMixin):
    # ORM model for one Boss Zhipin token pair, stored in "boss_token".
    # wt2 token value.
    wt2 = fields.CharField(description="Boss直聘wt2", max_length=200, null=True)
    # mpt token value.
    mpt = fields.CharField(description="Boss直聘mpt", max_length=200, null=True)
    # Whether the token is usable, defaults to True.
    is_active = fields.BooleanField(description="是否可用", default=True)
    # Failure counter, defaults to 0.
    failed_count = fields.IntField(description="失败次数", default=0)
    # Timestamp of the last use, nullable.
    last_used_time = fields.DatetimeField(description="最后使用时间", null=True)

    class Meta:
        table = "boss_token"
        table_description = "Boss直聘token表"
|
||||
173
app/repositories/clickhouse_repo.py
Normal file
173
app/repositories/clickhouse_repo.py
Normal file
@ -0,0 +1,173 @@
|
||||
import math
|
||||
from collections.abc import Generator
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, Any, List
|
||||
from clickhouse_connect.driver import AsyncClient
|
||||
from clickhouse_connect.driver.query import QueryResult
|
||||
|
||||
|
||||
class ClickHouseBaseRepo:
    """Base repository wrapping an async ClickHouse client for one table."""

    def __init__(self, clickhouse_client: "AsyncClient", table_name: str):
        """Store the client and the table every query/insert targets."""
        self._clickhouse_client = clickhouse_client
        self._table_name = table_name

    async def execute_query(self, query: str, parameters: Optional[Dict[str, Any]] = None) -> "QueryResult":
        """Run ``query`` with the given bound parameters and return the result."""
        return await self._clickhouse_client.query(query, parameters=parameters)

    async def execute_insert(self, data: List[Dict[str, Any]]) -> None:
        """Bulk-insert ``data`` into the repository table.

        Column names are taken from the first row; every other row must
        contain the same keys (a missing key raises ``KeyError``). An empty
        list is a no-op.
        """
        if not data:
            return

        columns = list(data[0].keys())
        values = [[row[col] for col in columns] for row in data]

        await self._clickhouse_client.insert(
            table=self._table_name,
            data=values,
            column_names=columns,
        )

    def _build_where_statements(
        self,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None
    ) -> tuple[List[str], Dict[str, Any]]:
        """Build WHERE fragments and their bound parameters.

        Args:
            filters: Column name -> value equality filters; ``None`` values
                are skipped.
            from_dt: Inclusive lower bound on ``created_at``.
            to_dt: Inclusive upper bound on ``created_at``.

        Returns:
            ``(statements, params)``: SQL fragments meant to be joined with
            ``AND`` and the parameter dict they reference.

        Raises:
            ValueError: If a filter key is not a plain (optionally dotted)
                identifier. Keys are interpolated into the SQL text as column
                names, so anything else could inject SQL.
        """
        where_statements = []
        params = {}

        if filters:
            for key, value in filters.items():
                if value is None:
                    continue
                # The key becomes part of the SQL text; only the value is bound.
                if not isinstance(key, str) or not all(
                    part.isidentifier() for part in key.split(".")
                ):
                    raise ValueError(f"Invalid filter column name: {key!r}")
                where_statements.append(f"{key} = %({key})s")
                params[key] = value

        if from_dt:
            where_statements.append("created_at >= %(from_dt)s")
            params['from_dt'] = from_dt

        if to_dt:
            where_statements.append("created_at <= %(to_dt)s")
            params['to_dt'] = to_dt

        return where_statements, params
|
||||
|
||||
|
||||
class JobAnalyticsRepo(ClickHouseBaseRepo):
    """Analytics queries against the ``job_analytics`` ClickHouse table."""

    # Maps the public ``interval`` argument to the ClickHouse bucketing function.
    _TIME_BUCKET_FUNCS = {
        "day": "toStartOfDay",
        "hour": "toStartOfHour",
        "week": "toStartOfWeek",
        "month": "toStartOfMonth",
    }

    def __init__(self, clickhouse_client: AsyncClient):
        super().__init__(clickhouse_client, "job_analytics")

    async def get_job_count(
        self,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> int:
        """Return the number of rows matching the filters and time range."""
        where_statements, params = self._build_where_statements(
            filters=filters, from_dt=from_dt, to_dt=to_dt
        )

        where_clause = " AND ".join(where_statements) if where_statements else "1=1"
        query = f"""
            SELECT COUNT(*)
            FROM {self._table_name}
            WHERE {where_clause}
        """

        result = await self.execute_query(query, parameters=params)
        return int(result.result_rows[0][0])

    async def group_jobs_by_column(
        self,
        group_by_column: str,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
        limit: int = 10
    ) -> List[Dict[str, Any]]:
        """Count rows per distinct value of ``group_by_column``.

        Args:
            group_by_column: Column to group by; must be a plain (optionally
                dotted) identifier because it is interpolated into the SQL.
            filters: Equality filters passed to ``_build_where_statements``.
            from_dt: Inclusive lower bound on ``created_at``.
            to_dt: Inclusive upper bound on ``created_at``.
            limit: Maximum number of groups returned, largest first.

        Returns:
            List of ``{"category": value, "job_count": n}`` ordered by
            ``job_count`` descending.

        Raises:
            ValueError: If ``group_by_column`` is not a valid identifier or
                ``limit`` cannot be converted to ``int``.
        """
        if not isinstance(group_by_column, str) or not all(
            part.isidentifier() for part in group_by_column.split(".")
        ):
            raise ValueError(f"Invalid group-by column name: {group_by_column!r}")
        # LIMIT is interpolated as text, so force it to an integer first.
        row_limit = int(limit)

        where_statements, params = self._build_where_statements(
            filters=filters, from_dt=from_dt, to_dt=to_dt
        )

        where_clause = " AND ".join(where_statements) if where_statements else "1=1"
        query = f"""
            SELECT
                {group_by_column} AS category,
                COUNT(*) AS job_count
            FROM {self._table_name}
            WHERE {where_clause}
            GROUP BY {group_by_column}
            ORDER BY job_count DESC
            LIMIT {row_limit}
        """

        result = await self.execute_query(query, parameters=params)
        return [
            {"category": row[0], "job_count": int(row[1])}
            for row in result.result_rows
        ]

    async def get_volume_trend(
        self,
        interval: str = "day",  # day, hour, week or month
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> List[Dict[str, Any]]:
        """Return row counts per time bucket and source.

        ``interval`` selects the bucket size; any value other than "day",
        "hour", "week" or "month" falls back to daily buckets.

        Returns:
            List of ``{"time": iso-string, "source": str, "count": int}``
            ordered by bucket start ascending.
        """
        where_statements, params = self._build_where_statements(
            filters=filters, from_dt=from_dt, to_dt=to_dt
        )

        where_clause = " AND ".join(where_statements) if where_statements else "1=1"

        time_func = self._TIME_BUCKET_FUNCS.get(interval, "toStartOfDay")

        # toTimeZone makes the buckets follow Asia/Shanghai (Beijing) time.
        query = f"""
            SELECT
                {time_func}(toTimeZone(created_at, 'Asia/Shanghai')) AS time_bucket,
                source,
                COUNT(*) AS count
            FROM {self._table_name}
            WHERE {where_clause}
            GROUP BY time_bucket, source
            ORDER BY time_bucket ASC
        """

        result = await self.execute_query(query, parameters=params)
        return [
            {
                "time": row[0].isoformat(),
                "source": row[1],
                "count": int(row[2])
            }
            for row in result.result_rows
        ]

    async def get_source_distribution(
        self,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> List[Dict[str, Any]]:
        """Return row counts grouped by the ``source`` column."""
        return await self.group_jobs_by_column("source", filters, from_dt, to_dt)
|
||||
1
app/schemas/__init__.py
Normal file
1
app/schemas/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
from .base import *
|
||||
76
app/schemas/analytics.py
Normal file
76
app/schemas/analytics.py
Normal file
@ -0,0 +1,76 @@
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, Any, List
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class AnalyticsQueryParams(BaseModel):
    """Query parameters for analytics requests."""

    # Time range.
    from_date: datetime | None = Field(default=None, description="开始日期")
    to_date: datetime | None = Field(default=None, description="结束日期")
    # Optional equality filters.
    city: str | None = Field(default=None, description="城市筛选")
    company_name: str | None = Field(default=None, description="公司名称筛选")
    position_name: str | None = Field(default=None, description="职位名称筛选")
    industry: str | None = Field(default=None, description="行业筛选")
    experience_required: str | None = Field(default=None, description="经验要求筛选")
    # Result size, validated to 1..100, defaults to 10.
    limit: int = Field(default=10, ge=1, le=100, description="返回结果数量限制")
|
||||
|
||||
|
||||
class SalaryStatistics(BaseModel):
    """Salary statistics."""

    # Average of the minimum salaries.
    avg_salary_min: float = Field(description="最低薪资平均值")
    # Average of the maximum salaries.
    avg_salary_max: float = Field(description="最高薪资平均值")
    # Lowest salary.
    min_salary: float = Field(description="最低薪资")
    # Highest salary.
    max_salary: float = Field(description="最高薪资")
    # Number of jobs that carry salary information.
    total_jobs: int = Field(description="有薪资信息的职位总数")
|
||||
|
||||
|
||||
class JobStatisticsResponse(BaseModel):
    """Job statistics response."""

    # Total number of jobs.
    total_jobs: int = Field(description="职位总数")
    # Reporting period as a mapping of string keys to optional strings.
    period: dict[str, str | None] = Field(description="统计时间段")
|
||||
|
||||
|
||||
class CategoryCount(BaseModel):
    """Count of jobs for one category."""

    # Category name.
    category: str = Field(description="分类名称")
    # Number of jobs in the category.
    job_count: int = Field(description="职位数量")
|
||||
|
||||
|
||||
class TopCompaniesResponse(BaseModel):
    """Top companies response."""

    # Per-company counts.
    companies: list[CategoryCount] = Field(description="公司列表")
    # Total.
    total_count: int = Field(description="总数")
|
||||
|
||||
|
||||
class TopPositionsResponse(BaseModel):
    """Top positions response."""

    # Per-position counts.
    positions: list[CategoryCount] = Field(description="职位列表")
    # Total.
    total_count: int = Field(description="总数")
|
||||
|
||||
|
||||
class CityDistributionResponse(BaseModel):
    """City distribution response."""

    # Per-city counts.
    cities: list[CategoryCount] = Field(description="城市列表")
    # Total.
    total_count: int = Field(description="总数")
|
||||
|
||||
|
||||
class IndustryDistributionResponse(BaseModel):
    """Industry distribution response."""

    # Per-industry counts.
    industries: list[CategoryCount] = Field(description="行业列表")
    # Total.
    total_count: int = Field(description="总数")
|
||||
|
||||
|
||||
class ExperienceDistributionResponse(BaseModel):
    """Experience-requirement distribution response."""

    # Per-experience-level counts.
    experience_levels: list[CategoryCount] = Field(description="经验要求列表")
    # Total.
    total_count: int = Field(description="总数")
|
||||
|
||||
|
||||
class AnalyticsDashboardResponse(BaseModel):
    """Analytics dashboard response."""

    # Overall job statistics.
    job_statistics: JobStatisticsResponse = Field(description="职位统计")
    # Category breakdowns, one list per dimension.
    top_companies: list[CategoryCount] = Field(description="热门公司")
    top_positions: list[CategoryCount] = Field(description="热门职位")
    city_distribution: list[CategoryCount] = Field(description="城市分布")
    industry_distribution: list[CategoryCount] = Field(description="行业分布")
    experience_distribution: list[CategoryCount] = Field(description="经验要求分布")
|
||||
17
app/schemas/apis.py
Normal file
17
app/schemas/apis.py
Normal file
@ -0,0 +1,17 @@
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from app.models.enums import MethodType
|
||||
|
||||
|
||||
class BaseApi(BaseModel):
    # Shared fields describing one API endpoint.
    # Endpoint path (required).
    path: str = Field(description="API路径", example="/api/v1/user/list")
    # Short summary, defaults to "".
    summary: str = Field(default="", description="API简介", example="查看用户列表")
    # HTTP method (required), one of MethodType.
    method: MethodType = Field(description="API方法", example="GET")
    # Tag label (required).
    tags: str = Field(description="API标签", example="User")
|
||||
|
||||
|
||||
class ApiCreate(BaseApi):
    # Creation payload: same fields as BaseApi, nothing added.
    pass
|
||||
|
||||
|
||||
class ApiUpdate(BaseApi):
    # Update payload: BaseApi fields plus the id of the row to update.
    id: int
|
||||
56
app/schemas/base.py
Normal file
56
app/schemas/base.py
Normal file
@ -0,0 +1,56 @@
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi.encoders import jsonable_encoder
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
|
||||
class Success(JSONResponse):
    """JSON response with a ``{"code", "msg", "data"}`` envelope.

    Extra keyword arguments are merged into the envelope (overriding the
    standard keys on a name clash). ``code`` is also used as the HTTP status.
    """

    def __init__(
        self,
        code: int = 200,
        msg: Optional[str] = "OK",
        data: Optional[Any] = None,
        **kwargs,
    ):
        envelope = {"code": code, "msg": msg, "data": data, **kwargs}
        super().__init__(content=jsonable_encoder(envelope), status_code=code)
|
||||
|
||||
|
||||
class Fail(JSONResponse):
    """JSON error response with a ``{"code", "msg", "data"}`` envelope.

    Defaults to code 400 and no message. Extra keyword arguments are merged
    into the envelope. ``code`` is also used as the HTTP status.
    """

    def __init__(
        self,
        code: int = 400,
        msg: Optional[str] = None,
        data: Optional[Any] = None,
        **kwargs,
    ):
        envelope = {"code": code, "msg": msg, "data": data, **kwargs}
        super().__init__(content=jsonable_encoder(envelope), status_code=code)
|
||||
|
||||
|
||||
class SuccessExtra(JSONResponse):
    """Paginated JSON response.

    The envelope holds ``code``, ``msg``, ``data``, ``total``, ``page`` and
    ``page_size``; extra keyword arguments are merged in afterwards.
    ``code`` is also used as the HTTP status.
    """

    def __init__(
        self,
        code: int = 200,
        msg: Optional[str] = None,
        data: Optional[Any] = None,
        total: int = 0,
        page: int = 1,
        page_size: int = 20,
        **kwargs,
    ):
        envelope = dict(
            code=code,
            msg=msg,
            data=data,
            total=total,
            page=page,
            page_size=page_size,
        )
        envelope.update(kwargs)
        super().__init__(content=jsonable_encoder(envelope), status_code=code)
|
||||
29
app/schemas/cleaning.py
Normal file
29
app/schemas/cleaning.py
Normal file
@ -0,0 +1,29 @@
|
||||
from datetime import datetime
|
||||
from typing import Optional, Any
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
class CleaningTaskBase(BaseModel):
    # Shared fields of a cleaning task.
    # Target: URL, company name or id (required).
    target: str = Field(description="目标(URL/公司名/ID)")
    # Cleaning mode, defaults to "auto".
    clean_type: str = Field(default="auto", description="清洗模式")
    # Target platform, defaults to "auto".
    platform: str = Field(default="auto", description="目标平台")
    # Optional proxy address.
    proxy: str | None = Field(default=None, description="代理地址")
    # Task status: pending / processing / success / fail.
    status: str = Field(default="pending", description="状态: pending/processing/success/fail")
    # Storage status: saved / duplicate / failed / unknown.
    storage_status: str = Field(default="unknown", description="存储状态: saved/duplicate/failed/unknown")
    # Whether the result was pushed to the remote side.
    remote_sent: bool = Field(default=False, description="是否已远程推送")
    # Summary of the cleaning result.
    result_summary: Optional[Any] = Field(default=None, description="清洗结果摘要")
    # Original request data.
    original_data: Optional[Any] = Field(default=None, description="原始请求数据")
    # Error message, if any.
    error_msg: str | None = Field(default=None, description="错误信息")
|
||||
|
||||
class CleaningTaskCreate(CleaningTaskBase):
    # Creation payload: same fields as CleaningTaskBase.
    ...
|
||||
|
||||
class CleaningTaskUpdate(CleaningTaskBase):
    # Update payload: same fields as CleaningTaskBase.
    ...
|
||||
|
||||
class CleaningTaskOut(CleaningTaskBase):
    # Output schema: base fields plus id and timestamps.
    # from_attributes lets the schema be validated from ORM objects.
    # Pydantic v2 deprecates the nested `class Config`; `model_config`
    # carries the same setting without the deprecation warning.
    model_config = {"from_attributes": True}

    id: int
    created_at: datetime
    updated_at: datetime
|
||||
18
app/schemas/depts.py
Normal file
18
app/schemas/depts.py
Normal file
@ -0,0 +1,18 @@
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class BaseDept(BaseModel):
    # Shared department fields.
    # Department name (required).
    name: str = Field(description="部门名称", example="研发中心")
    # Remark, defaults to "".
    desc: str = Field(default="", description="备注", example="研发中心")
    # Sort order, defaults to 0.
    order: int = Field(default=0, description="排序")
    # Parent department id, defaults to 0.
    parent_id: int = Field(default=0, description="父部门ID")
|
||||
|
||||
|
||||
class DeptCreate(BaseDept):
    # Creation payload: same fields as BaseDept, nothing added.
    pass
|
||||
|
||||
|
||||
class DeptUpdate(BaseDept):
    # Update payload: BaseDept fields plus the id of the row to update.
    id: int

    def update_dict(self):
        """Return the explicitly-set fields as a dict, without ``id``."""
        return self.model_dump(exclude={"id"}, exclude_unset=True)
|
||||
29
app/schemas/keyword.py
Normal file
29
app/schemas/keyword.py
Normal file
@ -0,0 +1,29 @@
|
||||
from datetime import date, datetime
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class KeywordBase(BaseModel):
    # Shared keyword fields; both are required.
    # City, at most 64 characters.
    city: str = Field(max_length=64, description="城市")
    # Job keyword, at most 128 characters.
    job: str = Field(max_length=128, description="职位关键词")
|
||||
|
||||
|
||||
class KeywordCreate(KeywordBase):
    # Creation payload: same fields as KeywordBase.
    ...
|
||||
|
||||
|
||||
class KeywordUpdate(BaseModel):
    # Partial update payload; both fields default to None.
    # City, at most 64 characters.
    city: str | None = Field(default=None, max_length=64, description="城市")
    # Job keyword, at most 128 characters.
    job: str | None = Field(default=None, max_length=128, description="职位关键词")
|
||||
|
||||
|
||||
class KeywordOut(KeywordBase):
    # Output schema: base fields plus id, request markers and timestamps.
    # from_attributes lets the schema be validated from ORM objects.
    # Pydantic v2 deprecates the nested `class Config`; `model_config`
    # carries the same setting without the deprecation warning.
    model_config = {"from_attributes": True}

    id: int
    last_requested_date: Optional[date] = None
    last_requested_at: Optional[datetime] = None
    created_at: datetime
    updated_at: datetime
|
||||
20
app/schemas/login.py
Normal file
20
app/schemas/login.py
Normal file
@ -0,0 +1,20 @@
|
||||
from datetime import datetime
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class CredentialsSchema(BaseModel):
    # Login credentials; both fields are required.
    # User name.
    username: str = Field(description="用户名称", example="admin")
    # Password.
    password: str = Field(description="密码", example="123456")
|
||||
|
||||
|
||||
class JWTOut(BaseModel):
    # Login result returned to the client.
    # Issued access token.
    access_token: str
    # User name the token belongs to.
    username: str
|
||||
|
||||
|
||||
class JWTPayload(BaseModel):
    # Claims carried in the JWT.
    # Id of the user.
    user_id: int
    # User name.
    username: str
    # Superuser flag.
    is_superuser: bool
    # Expiry timestamp.
    exp: datetime
|
||||
52
app/schemas/menus.py
Normal file
52
app/schemas/menus.py
Normal file
@ -0,0 +1,52 @@
|
||||
from enum import StrEnum
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class MenuType(StrEnum):
|
||||
CATALOG = "catalog" # 目录
|
||||
MENU = "menu" # 菜单
|
||||
|
||||
|
||||
class BaseMenu(BaseModel):
    # Full menu representation; no field declares a default.
    id: int
    name: str
    path: str
    remark: dict | None
    menu_type: MenuType | None
    icon: str | None
    order: int
    parent_id: int
    is_hidden: bool
    component: str
    keepalive: bool
    redirect: str | None
    # Nested child menus of the same schema.
    children: list["BaseMenu"] | None
|
||||
|
||||
|
||||
class MenuCreate(BaseModel):
    # Payload for creating a menu entry.
    # Menu kind, defaults to the catalog value.
    menu_type: MenuType = Field(default=MenuType.CATALOG.value)
    name: str = Field(example="用户管理")
    icon: str | None = "ph:user-list-bold"
    path: str = Field(example="/system/user")
    order: int | None = Field(example=1)
    parent_id: int | None = Field(default=0, example=0)
    is_hidden: bool | None = False
    # Front-end component, defaults to "Layout".
    component: str = Field(default="Layout", example="/system/user")
    keepalive: bool | None = True
    redirect: str | None = ""
|
||||
|
||||
|
||||
class MenuUpdate(BaseModel):
    # Payload for updating a menu entry, identified by id.
    id: int
    menu_type: MenuType | None = Field(example=MenuType.CATALOG.value)
    name: str | None = Field(example="用户管理")
    icon: str | None = "ph:user-list-bold"
    path: str | None = Field(example="/system/user")
    order: int | None = Field(example=1)
    parent_id: int | None = Field(example=0)
    is_hidden: bool | None = False
    component: str = Field(example="/system/user")
    # Unlike MenuCreate, keepalive defaults to False here.
    keepalive: bool | None = False
    redirect: str | None = ""
|
||||
33
app/schemas/proxy.py
Normal file
33
app/schemas/proxy.py
Normal file
@ -0,0 +1,33 @@
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ProxyConfigBase(BaseModel):
    # Shared proxy configuration fields.
    # Display name (required).
    name: str = Field(description="名称")
    # Proxy type: http / socks / tunnel (required).
    proxy_type: str = Field(description="代理类型: http/socks/tunnel")
    # Target platform, defaults to "all".
    platform: str = Field(default="all", description="目标平台: boss/qcwy/zhilian/all")
    # Proxy address (required).
    proxy_url: str = Field(description="代理地址")
    # Whether the proxy is usable, defaults to True.
    is_active: bool = Field(default=True, description="是否可用")
|
||||
|
||||
|
||||
class ProxyConfigCreate(ProxyConfigBase):
    # Creation payload: same fields as ProxyConfigBase.
    ...
|
||||
|
||||
|
||||
class ProxyConfigUpdate(BaseModel):
    # Partial update payload: id is required, every other field defaults to None.
    id: int = Field(description="主键ID")
    name: str | None = Field(default=None, description="名称")
    proxy_type: str | None = Field(default=None, description="代理类型: http/socks/tunnel")
    platform: str | None = Field(default=None, description="目标平台: boss/qcwy/zhilian/all")
    proxy_url: str | None = Field(default=None, description="代理地址")
    is_active: bool | None = Field(default=None, description="是否可用")
|
||||
|
||||
|
||||
class ProxyConfigOut(ProxyConfigBase):
    # Output schema: base fields plus id and timestamps.
    # from_attributes lets the schema be validated from ORM objects.
    # Pydantic v2 deprecates the nested `class Config`; `model_config`
    # carries the same setting without the deprecation warning.
    model_config = {"from_attributes": True}

    id: int
    created_at: datetime
    updated_at: datetime
|
||||
45
app/schemas/proxy_provider.py
Normal file
45
app/schemas/proxy_provider.py
Normal file
@ -0,0 +1,45 @@
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class ProxyProviderBase(BaseModel):
    # Shared proxy-provider fields.
    # Display name (required).
    name: str = Field(description="名称")
    # Target platform, defaults to "all".
    platform: str = Field(default="all", description="目标平台: boss/qcwy/zhilian/all")
    # Parse mode: json / text, defaults to "json".
    mode: str = Field(default="json", description="解析模式: json/text")
    # Optional paths used to extract values from a JSON response.
    list_path: str | None = Field(default=None, description="JSON列表路径")
    ip_path: str | None = Field(default=None, description="IP字段路径")
    port_path: str | None = Field(default=None, description="端口字段路径")
    username_path: str | None = Field(default=None, description="用户名字段路径")
    password_path: str | None = Field(default=None, description="密码字段路径")
    # Regular expression used in text mode.
    pattern: str | None = Field(default=None, description="文本解析正则")
    # Template for the final proxy string (required).
    template: str = Field(description="最终代理模板")
|
||||
|
||||
|
||||
class ProxyProviderCreate(ProxyProviderBase):
    # Creation payload: same fields as ProxyProviderBase.
    ...
|
||||
|
||||
|
||||
class ProxyProviderUpdate(BaseModel):
    # Partial update payload: id is required, every other field defaults to None.
    id: int = Field(description="主键ID")
    name: str | None = Field(default=None, description="名称")
    platform: str | None = Field(default=None, description="目标平台: boss/qcwy/zhilian/all")
    mode: str | None = Field(default=None, description="解析模式: json/text")
    list_path: str | None = Field(default=None, description="JSON列表路径")
    ip_path: str | None = Field(default=None, description="IP字段路径")
    port_path: str | None = Field(default=None, description="端口字段路径")
    username_path: str | None = Field(default=None, description="用户名字段路径")
    password_path: str | None = Field(default=None, description="密码字段路径")
    pattern: str | None = Field(default=None, description="文本解析正则")
    template: str | None = Field(default=None, description="最终代理模板")
|
||||
|
||||
|
||||
class ProxyProviderOut(ProxyProviderBase):
    # Output schema: base fields plus id and timestamps.
    # from_attributes lets the schema be validated from ORM objects.
    # Pydantic v2 deprecates the nested `class Config`; `model_config`
    # carries the same setting without the deprecation warning.
    model_config = {"from_attributes": True}

    id: int
    created_at: datetime
    updated_at: datetime
|
||||
|
||||
32
app/schemas/roles.py
Normal file
32
app/schemas/roles.py
Normal file
@ -0,0 +1,32 @@
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class BaseRole(BaseModel):
    # Full role representation.
    id: int
    name: str
    desc: str = ""
    # Related collections, each defaulting to an empty list.
    users: list | None = []
    menus: list | None = []
    apis: list | None = []
    # Timestamps; no default is declared.
    created_at: datetime | None
    updated_at: datetime | None
|
||||
|
||||
|
||||
class RoleCreate(BaseModel):
    # Payload for creating a role.
    # Role name (required).
    name: str = Field(example="管理员")
    # Description, defaults to "".
    desc: str = Field(default="", example="管理员角色")
|
||||
|
||||
|
||||
class RoleUpdate(BaseModel):
    # Payload for updating a role, identified by id.
    id: int = Field(example=1)
    # Role name (required).
    name: str = Field(example="管理员")
    # Description, defaults to "".
    desc: str = Field(default="", example="管理员角色")
|
||||
|
||||
|
||||
class RoleUpdateMenusApis(BaseModel):
    # Payload assigning menus and APIs to the role identified by id.
    id: int
    # Menu ids, defaults to an empty list.
    menu_ids: list[int] = []
    # API descriptors as dicts, defaults to an empty list.
    api_infos: list[dict] = []
|
||||
36
app/schemas/token.py
Normal file
36
app/schemas/token.py
Normal file
@ -0,0 +1,36 @@
|
||||
from pydantic import BaseModel, Field
|
||||
from datetime import datetime
|
||||
from typing import Optional, List, Dict, Any
|
||||
|
||||
|
||||
class BossTokenCreate(BaseModel):
    # Payload for creating a Boss Zhipin token pair.
    # wt2 token (required).
    wt2: str = Field(description="Boss直聘wt2")
    # mpt token (required).
    mpt: str = Field(description="Boss直聘mpt")
    # Whether the token is usable, defaults to True.
    is_active: bool = Field(default=True, description="是否可用")
|
||||
|
||||
|
||||
class BossTokenUpdate(BaseModel):
    # Partial update payload; every field defaults to None.
    wt2: str | None = Field(default=None, description="Boss直聘wt2")
    mpt: str | None = Field(default=None, description="Boss直聘mpt")
    is_active: bool | None = Field(default=None, description="是否可用")
    failed_count: int | None = Field(default=None, description="失败次数")
    last_used_time: datetime | None = Field(default=None, description="最后使用时间")
|
||||
|
||||
|
||||
class BossTokenResponse(BaseModel):
    # Output schema for one Boss Zhipin token row.
    # from_attributes lets the schema be validated from ORM objects.
    # Pydantic v2 deprecates the nested `class Config`; `model_config`
    # carries the same setting without the deprecation warning.
    model_config = {"from_attributes": True}

    id: int
    wt2: Optional[str]
    mpt: Optional[str]
    is_active: bool
    failed_count: int
    last_used_time: Optional[datetime]
    created_at: datetime
    updated_at: datetime
|
||||
|
||||
|
||||
class BossTokenList(BaseModel):
    # List payload of tokens.
    # Total count.
    total: int
    # Token rows.
    items: list[BossTokenResponse]
|
||||
44
app/schemas/users.py
Normal file
44
app/schemas/users.py
Normal file
@ -0,0 +1,44 @@
|
||||
from datetime import datetime
|
||||
from typing import List, Optional
|
||||
|
||||
from pydantic import BaseModel, EmailStr, Field
|
||||
|
||||
|
||||
class BaseUser(BaseModel):
    # Full user representation.
    id: int
    email: EmailStr | None = None
    username: str | None = None
    is_active: bool | None = True
    is_superuser: bool | None = False
    # Timestamps; no default is declared.
    created_at: datetime | None
    updated_at: datetime | None
    last_login: datetime | None
    # Roles, defaults to an empty list.
    roles: list | None = []
|
||||
|
||||
|
||||
class UserCreate(BaseModel):
    # Payload for creating a user.
    email: EmailStr = Field(example="admin@qq.com")
    username: str = Field(example="admin")
    password: str = Field(example="123456")
    is_active: bool | None = True
    is_superuser: bool | None = False
    # Ids of the roles to assign, defaults to an empty list.
    role_ids: list[int] | None = []
    # Department id, defaults to 0.
    dept_id: int | None = Field(default=0, description="部门ID")

    def create_dict(self):
        """Return the explicitly-set fields as a dict, without ``role_ids``."""
        return self.model_dump(exclude={"role_ids"}, exclude_unset=True)
|
||||
|
||||
|
||||
class UserUpdate(BaseModel):
    """Payload for updating a user; ``id``, ``email`` and ``username`` are required."""

    id: int
    email: EmailStr
    username: str

    # Account flags.
    is_active: Optional[bool] = Field(default=True)
    is_superuser: Optional[bool] = Field(default=False)

    # Role and department assignment.
    role_ids: Optional[List[int]] = Field(default=[])
    dept_id: Optional[int] = Field(default=0)
|
||||
|
||||
|
||||
class UpdatePassword(BaseModel):
    """Password-change payload; both fields are required."""

    old_password: str = Field(..., description="旧密码")
    new_password: str = Field(..., description="新密码")
|
||||
58
app/services/analytics_service.py
Normal file
58
app/services/analytics_service.py
Normal file
@ -0,0 +1,58 @@
|
||||
from datetime import datetime
|
||||
from typing import Optional, Dict, Any, List
|
||||
from clickhouse_connect.driver import AsyncClient
|
||||
from app.repositories.clickhouse_repo import JobAnalyticsRepo
|
||||
|
||||
|
||||
class AnalyticsService:
    """Analytics service that delegates every query to ``JobAnalyticsRepo``."""

    def __init__(self, clickhouse_client: "AsyncClient"):
        # A single repository instance bound to the supplied client.
        self.job_repo = JobAnalyticsRepo(clickhouse_client)

    async def get_job_statistics(
        self,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> Dict[str, Any]:
        """Return job statistics (total count only) for the requested period.

        The result has ``total_jobs`` plus a ``period`` dict whose bounds are
        ISO-8601 strings, or ``None`` for a bound that was not given.
        """

        def _iso(moment: Optional[datetime]) -> Optional[str]:
            return moment.isoformat() if moment else None

        count = await self.job_repo.get_job_count(
            filters=filters, from_dt=from_dt, to_dt=to_dt
        )
        period = {"from_date": _iso(from_dt), "to_date": _iso(to_dt)}
        return {"total_jobs": count, "period": period}

    async def get_volume_trend(
        self,
        interval: str = "day",
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> List[Dict[str, Any]]:
        """Return the data-volume trend as produced by the repository."""
        trend = await self.job_repo.get_volume_trend(
            interval=interval,
            filters=filters,
            from_dt=from_dt,
            to_dt=to_dt,
        )
        return trend

    async def get_source_distribution(
        self,
        filters: Optional[Dict[str, Any]] = None,
        from_dt: Optional[datetime] = None,
        to_dt: Optional[datetime] = None,
    ) -> List[Dict[str, Any]]:
        """Return the distribution of data sources as produced by the repository."""
        distribution = await self.job_repo.get_source_distribution(
            filters=filters,
            from_dt=from_dt,
            to_dt=to_dt,
        )
        return distribution
|
||||
363
app/services/cleaning.py
Normal file
363
app/services/cleaning.py
Normal file
@ -0,0 +1,363 @@
|
||||
import csv
|
||||
import io
|
||||
import re
|
||||
from typing import List, Dict, Any, Union, Optional
|
||||
from fastapi import UploadFile
|
||||
from loguru import logger
|
||||
from app.services.crawler.boss import BossService
|
||||
from app.services.crawler.qcwy import QcwyService
|
||||
from app.services.crawler.zhilian import ZhilianService
|
||||
from app.services.job import DataRouterService, DataType, PlatformType
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from app.models.token import BossToken
|
||||
from jobs_spider.qcwy.search_company_jobs import _extract_items as qcwy_extract_items
|
||||
|
||||
class CleaningService:
    """Route cleaning targets (URLs, IDs, company names) to the platform crawlers
    and persist whatever they return through ``DataRouterService``.

    NOTE(review): the crawler calls below are not awaited, so they appear to be
    synchronous and would block the event loop while they run — confirm and
    consider off-loading them to a worker thread.
    """

    # URL shapes from which a bare job ID can be extracted, per platform.
    _JOB_ID_PATTERNS = {
        "boss": re.compile(r'job_detail/([^.]+)\.html'),
        "qcwy": re.compile(r'/(\d+)\.html'),
        "zhilian": re.compile(r'jobs\.zhaopin\.com/(\w+)\.htm'),
    }
    # Boss company page URL -> company ID.
    _BOSS_COMPANY_URL = re.compile(r'gongsi/([^.]+)\.html')
    # 51job company IDs may carry a "co" prefix in front of the numeric part.
    _QCWY_COMPANY_ID = re.compile(r'^co(\d+)$')

    def __init__(self):
        self.boss_service = BossService()
        self.qcwy_service = QcwyService()
        self.zhilian_service = ZhilianService()
        # Created lazily by get_data_router().
        self.data_router = None
        self._boss_token_loaded = False

    def _apply_proxy(self, proxy: Optional[str]) -> None:
        """Hand ``proxy`` (possibly ``None``) to every crawler service."""
        for crawler in (self.boss_service, self.qcwy_service, self.zhilian_service):
            crawler.set_proxy(proxy)

    async def _ensure_boss_token_loaded(self) -> None:
        """Load the most recently updated active Boss token once.

        Does nothing when a token was already loaded and the Boss service still
        holds an ``mpt`` value. Logs a warning and returns when no active token
        exists.
        """
        if self._boss_token_loaded and self.boss_service.login_data.get("mpt"):
            return
        token_obj = await BossToken.filter(is_active=True).order_by("-updated_at").first()
        if not token_obj:
            logger.warning("BossToken not found or inactive")
            return
        self.boss_service.set_login_data(token_obj.mpt or "", "")
        self._boss_token_loaded = True

    async def get_data_router(self) -> "DataRouterService":
        """Return the shared ``DataRouterService``, creating it on first use."""
        if not self.data_router:
            client = await clickhouse_manager.get_client()
            self.data_router = DataRouterService(client)
        return self.data_router

    async def parse_file(self, file: "UploadFile") -> List[str]:
        """Read an uploaded file and return its non-empty targets.

        A ``.csv`` upload (extension matched case-insensitively) contributes the
        first column of every row; any other upload contributes one target per
        line. Targets are stripped and empty ones dropped.

        Raises:
            UnicodeDecodeError: if the content is not valid UTF-8.
        """
        content = await file.read()
        # ``filename`` may be missing on an upload; treat that as plain text.
        filename = (file.filename or "").lower()
        # "utf-8-sig" removes a leading BOM for both CSV and plain-text uploads.
        text = content.decode('utf-8-sig')

        if filename.endswith('.csv'):
            targets = [row[0].strip() for row in csv.reader(io.StringIO(text)) if row]
        else:
            targets = [line.strip() for line in text.splitlines()]
        return [t for t in targets if t]

    @staticmethod
    def _format_result(target: str, result: Any) -> Dict[str, Any]:
        """Normalize a handler's return value into the per-item status dict."""
        if not result:
            return {
                "success": False,
                "target": target,
                "error": "No data found or operation failed",
                "storage_status": "failed",
                "remote_sent": False
            }

        # A bare boolean can only be True here (False was handled above).
        if isinstance(result, bool):
            return {
                "success": True,
                "target": target,
                "error": None,
                "storage_status": "unknown",
                "remote_sent": False
            }

        # Otherwise it is the dict returned by DataRouterService.store_data.
        succeeded = result.get("success", False)
        if result.get("duplicate"):
            storage_status = "duplicate"
        elif succeeded:
            storage_status = "saved"
        else:
            # A failed store must not be reported as "saved".
            storage_status = "failed"
        return {
            "success": succeeded,
            "target": target,
            "error": result.get("message") if not succeeded else None,
            "storage_status": storage_status,
            "remote_sent": result.get("remote_sent", False),
            "data_summary": result.get("data_summary"),
            "original_data": result.get("original_data")
        }

    async def process_single_item(self, target: str, clean_type: str = "auto", platform: str = "auto", proxy: Optional[str] = None) -> Dict[str, Any]:
        """Clean one target and return a status dict; never raises.

        ``clean_type`` selects the handler (``auto``, ``clean_url``, ``job_id``,
        ``company_name``, ``company_id``, ``company_jobs``) and ``platform`` the
        crawler. Unsupported combinations are reported as "no data found"; any
        exception is logged and reported with ``storage_status="error"``.
        """
        try:
            await self._ensure_boss_token_loaded()
            self._apply_proxy(proxy)

            result = None
            if clean_type == "auto" or (clean_type == "clean_url" and platform == "auto"):
                result = await self.clean_target_auto(target)
            elif clean_type == "clean_url":
                url_handlers = {
                    "boss": self._process_boss_url,
                    "qcwy": self._process_qcwy_url,
                    "zhilian": self._process_zhilian_url,
                }
                if platform in url_handlers:
                    result = await url_handlers[platform](target)
            elif clean_type == "job_id":
                result = await self.clean_by_job_id(target, platform)
            elif clean_type == "company_name":
                result = await self.clean_by_company_name(target, platform)
            elif clean_type == "company_id":
                result = await self.clean_by_company_id(target, platform)
            elif clean_type == "company_jobs":
                jobs_handlers = {
                    "boss": self.clean_boss_company_jobs,
                    "qcwy": self.clean_qcwy_company_jobs,
                    "zhilian": self.clean_zhilian_company_jobs,
                }
                if platform in jobs_handlers:
                    result = await jobs_handlers[platform](target)

            return self._format_result(target, result)

        except Exception as e:
            logger.error(f"Error processing item {target}: {e}")
            return {
                "success": False,
                "target": target,
                "error": str(e),
                "storage_status": "error",
                "remote_sent": False
            }

    async def clean_target_auto(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Pick the platform from the target's domain; otherwise treat it as a company name."""
        if "zhipin.com" in target:
            return await self._process_boss_url(target)
        if "51job.com" in target:
            return await self._process_qcwy_url(target)
        if "zhaopin.com" in target:
            return await self._process_zhilian_url(target)
        return await self._process_search_company(target)

    async def _store_single(self, router, data, data_type, platform_type) -> Union[bool, Dict[str, Any]]:
        """Store one fetched record; return the store result with the raw data attached, else False."""
        if not data:
            return False
        result = await router.store_data(data, data_type, platform_type)
        if result and isinstance(result, dict):
            result['original_data'] = data
            return result
        return False

    async def _store_jobs(self, router, jobs, platform_type, original) -> Any:
        """Store every job in ``jobs`` and return the result of the last one.

        Only the last store result is reported, as an indication of the batch.
        When it is a non-empty dict, ``original`` (the full crawler response) is
        attached as ``original_data``. Returns False when nothing truthy came back.
        """
        last_result = None
        for job in jobs:
            last_result = await router.store_data(job, DataType.JOB, platform_type)
        if last_result and isinstance(last_result, dict):
            last_result["original_data"] = original
        return last_result if last_result else False

    async def clean_by_job_id(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
        """Fetch and store one job; ``target`` may be a bare ID or a job URL."""
        router = await self.get_data_router()
        fetchers = {
            "boss": (self.boss_service.get_job_detail_by_id, PlatformType.BOSS),
            "qcwy": (self.qcwy_service.get_job_detail, PlatformType.QCWY),
            "zhilian": (self.zhilian_service.get_job_detail, PlatformType.ZHILIAN),
        }
        if platform not in fetchers:
            return False

        # Reduce a URL to its ID; anything that does not match is used as-is.
        match = self._JOB_ID_PATTERNS[platform].search(target)
        if match:
            target = match.group(1)

        fetch, platform_type = fetchers[platform]
        return await self._store_single(router, fetch(target), DataType.JOB, platform_type)

    async def clean_by_company_name(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
        """Search jobs by company name and store every hit.

        A search may return many jobs; the last store result stands in for the
        whole batch and carries the full search response as ``original_data``.
        """
        router = await self.get_data_router()
        if platform == "boss":
            res = self.boss_service.search_jobs(target)
            if res and res.get('zpData') and res['zpData'].get('list'):
                return await self._store_jobs(router, res['zpData']['list'], PlatformType.BOSS, res)
        elif platform == "qcwy":
            res = self.qcwy_service.search_jobs(target)
            if res:
                return await self._store_jobs(router, res, PlatformType.QCWY, res)
        elif platform == "zhilian":
            res = self.zhilian_service.search_company_jobs_by_name(target)
            if res and isinstance(res, dict):
                items = (res.get("data") or {}).get("list") or []
                if not isinstance(items, list):
                    items = []
                return await self._store_jobs(router, items, PlatformType.ZHILIAN, res)
        return False

    async def clean_by_company_id(self, target: str, platform: str) -> Union[bool, Dict[str, Any]]:
        """Fetch and store one company record by its platform ID."""
        router = await self.get_data_router()
        if platform == "boss":
            data = self.boss_service.get_company_detail_by_id(target)
            platform_type = PlatformType.BOSS
        elif platform == "qcwy":
            prefixed = self._QCWY_COMPANY_ID.match(target)
            company_id = prefixed.group(1) if prefixed else target
            data = self.qcwy_service.get_company_info(company_id)
            platform_type = PlatformType.QCWY
        elif platform == "zhilian":
            data = self.zhilian_service.get_company_detail(target)
            platform_type = PlatformType.ZHILIAN
        else:
            return False
        return await self._store_single(router, data, DataType.COMPANY, platform_type)

    async def clean_boss_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Store all jobs of a Boss company given its ID or company-page URL."""
        router = await self.get_data_router()
        match = self._BOSS_COMPANY_URL.search(target)
        company_id = match.group(1) if match else target

        data = self.boss_service.get_company_jobs_by_id(company_id)
        if not data:
            return False

        # The job list is read from "jobList" or, failing that, "list".
        jobs = []
        zp_data = data.get("zpData") if isinstance(data, dict) else None
        if isinstance(zp_data, dict):
            if isinstance(zp_data.get("jobList"), list):
                jobs = zp_data.get("jobList") or []
            elif isinstance(zp_data.get("list"), list):
                jobs = zp_data.get("list") or []
        if not jobs:
            return False

        result = await self._store_jobs(router, jobs, PlatformType.BOSS, data)
        return result if isinstance(result, dict) else False

    async def clean_qcwy_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Store all jobs of a 51job company given its ID (with or without the "co" prefix)."""
        router = await self.get_data_router()
        prefixed = self._QCWY_COMPANY_ID.match(target)
        company_id = prefixed.group(1) if prefixed else target

        data = self.qcwy_service.get_company_jobs_by_id(company_id)
        if not data:
            return False

        jobs_list = qcwy_extract_items(data)
        jobs: List[Dict[str, Any]] = jobs_list if isinstance(jobs_list, list) else []
        if not jobs:
            return False

        result = await self._store_jobs(router, jobs, PlatformType.QCWY, data)
        return result if isinstance(result, dict) else False

    async def clean_zhilian_company_jobs(self, target: str) -> Union[bool, Dict[str, Any]]:
        """Store all jobs of a Zhilian company given its ID."""
        router = await self.get_data_router()
        data = self.zhilian_service.get_company_jobs_by_id(target)
        if not data or not isinstance(data, dict):
            return False

        jobs = (data.get("data") or {}).get("list") or []
        if not isinstance(jobs, list) or not jobs:
            return False

        result = await self._store_jobs(router, jobs, PlatformType.ZHILIAN, data)
        return result if isinstance(result, dict) else False

    async def _process_boss_url(self, url: str) -> Union[bool, Dict[str, Any]]:
        """Handle a Boss URL: job page, then company page, else treat it as a job ID."""
        job_match = self._JOB_ID_PATTERNS["boss"].search(url)
        if job_match:
            return await self.clean_by_job_id(job_match.group(1), "boss")

        company_match = self._BOSS_COMPANY_URL.search(url)
        if company_match:
            return await self.clean_by_company_id(company_match.group(1), "boss")

        # Fallback: assume it's a job ID
        return await self.clean_by_job_id(url, "boss")

    async def _process_qcwy_url(self, url: str) -> Union[bool, Dict[str, Any]]:
        """Handle a 51job URL; anything that is not a job page is treated as a job ID."""
        job_match = self._JOB_ID_PATTERNS["qcwy"].search(url)
        job_id = job_match.group(1) if job_match else url
        return await self.clean_by_job_id(job_id, "qcwy")

    async def _process_zhilian_url(self, url: str) -> Union[bool, Dict[str, Any]]:
        """Handle a Zhilian URL; anything that is not a job page is treated as a job ID."""
        job_match = self._JOB_ID_PATTERNS["zhilian"].search(url)
        job_id = job_match.group(1) if job_match else url
        return await self.clean_by_job_id(job_id, "zhilian")

    async def _process_search_company(self, name: str) -> Union[bool, Dict[str, Any]]:
        """Treat ``name`` as a company name and search it on Boss."""
        return await self.clean_by_company_name(name, "boss")
|
||||
645
app/services/company_cleaner.py
Normal file
645
app/services/company_cleaner.py
Normal file
@ -0,0 +1,645 @@
|
||||
import asyncio
|
||||
import json
|
||||
import random
|
||||
from datetime import datetime
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from loguru import logger
|
||||
|
||||
from app.core.clickhouse import clickhouse_manager
|
||||
from app.models.token import BossToken
|
||||
from app.services.crawler.boss import BossService
|
||||
from app.services.crawler.qcwy import QcwyService
|
||||
from app.services.crawler.zhilian import ZhilianService
|
||||
|
||||
|
||||
class CompanyCleaner:
|
||||
def __init__(self):
    """Create one crawler service per platform; no Boss token is loaded yet."""
    self.boss_service = BossService()
    self.qcwy_service = QcwyService()
    self.zhilian_service = ZhilianService()
    # Flipped to True by _ensure_boss_token_loaded() after a successful load.
    self._boss_token_loaded: bool = False
|
||||
|
||||
def _apply_proxy(self, proxy: Optional[str]) -> None:
|
||||
self.boss_service.set_proxy(proxy)
|
||||
self.qcwy_service.set_proxy(proxy)
|
||||
self.zhilian_service.set_proxy(proxy)
|
||||
|
||||
async def _ensure_boss_token_loaded(self) -> None:
    """Load the most recently updated active Boss token, at most once.

    Skips the lookup when a token was already loaded and the Boss service
    still holds an ``mpt`` value. Logs a warning when no active token exists.
    """
    already_loaded = self._boss_token_loaded and self.boss_service.login_data.get("mpt")
    if already_loaded:
        return

    newest_active = BossToken.filter(is_active=True).order_by("-updated_at")
    token_obj = await newest_active.first()
    if token_obj:
        self.boss_service.set_login_data(token_obj.mpt or "", "")
        self._boss_token_loaded = True
        return

    logger.warning("BossToken not found or inactive in CompanyCleaner")
|
||||
|
||||
async def collect_pending_companies(self, limit: int = 1000, source: Optional[str] = None):
    """Queue companies for cleaning, for one source or (when ``source`` is None) all.

    Sources are processed in the order zhilian, qcwy, boss; a ``source`` that
    matches none of them results in no collection at all.
    """
    client = await clickhouse_manager.get_client()
    logger.info(f"Starting to collect pending companies (limit={limit}, source={source or 'all'})...")

    collectors = (
        ("zhilian", self._collect_zhilian),
        ("qcwy", self._collect_qcwy),
        ("boss", self._collect_boss),
    )
    for source_name, collect in collectors:
        if source is None or source == source_name:
            await collect(client, limit)

    logger.info("Finished collecting pending companies.")
|
||||
|
||||
async def _fetch_known_company_ids(self, client, table: str, id_key: str, label: str) -> set:
    """Return the company IDs found in ``table`` within the recent look-back window.

    These are the companies already fetched, which should not be requested
    again. The query is retried with progressively cheaper variants when
    ClickHouse reports a memory error; if every variant fails, an empty set is
    returned so collection can continue. Any other error is re-raised.

    ``table`` and ``id_key`` are internal constants, never user input.
    """
    # (look-back days, row cap, use SAMPLE 0.1): each step is cheaper than the
    # one before. Every fallback keeps a LIMIT so it can never be less bounded
    # than the primary query.
    # NOTE(review): SAMPLE presumably needs a sampling key on the table; without
    # one the last step would fail with a non-memory error and raise — confirm.
    plans = ((90, 50000, False), (1, 5000, False), (1, 2000, True))

    for attempt, (days, row_cap, sampled) in enumerate(plans):
        sample_clause = " SAMPLE 0.1" if sampled else ""
        sql = f"""
            SELECT DISTINCT JSONExtractString(json_data, '{id_key}') as cid
            FROM {table}{sample_clause}
            PREWHERE updated_at > now() - INTERVAL {days} DAY
                AND json_data != ''
            WHERE JSONExtractString(json_data, '{id_key}') != ''
            LIMIT {row_cap}
        """
        try:
            logger.info(f"Querying existing {label} companies (attempt {attempt+1})...")
            result = await client.query(sql)
            return {row[0] for row in result.result_rows if row[0]}
        except Exception as e:
            if "memory" not in str(e).lower():
                logger.error(f"Non-memory error while querying existing companies: {e}")
                raise
            if attempt == len(plans) - 1:
                logger.error(f"Failed to query existing companies after {attempt+1} attempts: {e}")
                logger.warning("Using empty set for existing_cids, continuing with collection...")
                return set()
            next_days, _, next_sampled = plans[attempt + 1]
            if next_sampled:
                logger.warning("Memory error persists, using SAMPLE 0.1")
            else:
                logger.warning(f"Memory error, reducing time range to {next_days} days")
    return set()


async def _fetch_pending_company_ids(self, client, source: str) -> set:
    """Return the company IDs already present in ``pending_company`` for ``source``."""
    result = await client.query(
        f"SELECT DISTINCT company_id FROM job_data.pending_company WHERE source = '{source}'"
    )
    return {row[0] for row in result.result_rows if row[0]}


def _to_pending_rows(self, source: str, candidates, excluded: set, limit: int) -> List[Dict[str, Any]]:
    """Turn ``(company_id, company_name)`` pairs into at most ``limit`` pending rows.

    Skips empty IDs, IDs in ``excluded``, and IDs already picked in this batch.
    The job query is DISTINCT on the (id, name) pair, so one ID can come back
    several times under different names.
    """
    now = datetime.now()
    picked = set()
    rows: List[Dict[str, Any]] = []
    for cid, cname in candidates:
        if not cid or cid in excluded or cid in picked:
            continue
        if len(rows) >= limit:
            break
        picked.add(cid)
        rows.append(
            {
                "source": source,
                "company_id": cid,
                "company_name": cname,
                "status": "pending",
                "created_at": now,
                "updated_at": now,
            }
        )
    return rows


async def _collect_from_recent_jobs(
    self,
    client,
    limit: int,
    source: str,
    label: str,
    id_key: str,
    name_key: str,
):
    """Shared Zhilian/Boss flow: queue companies seen in the last 30 days of jobs.

    Reads ``job_data.{source}_job`` and excludes companies found by the
    existing-company lookup on ``job_data.{source}_company`` or already pending.
    """
    known = await self._fetch_known_company_ids(client, f"job_data.{source}_company", id_key, label)
    queued = await self._fetch_pending_company_ids(client, source)
    excluded = known | queued

    # PREWHERE filters on time before json_data is read; twice the limit is
    # fetched so enough rows survive the Python-side exclusion.
    query = f"""
        SELECT DISTINCT
            JSONExtractString(json_data, '{id_key}') as cid,
            JSONExtractString(json_data, '{name_key}') as cname
        FROM job_data.{source}_job
        PREWHERE created_at > now() - INTERVAL 30 DAY
        WHERE json_data != ''
            AND JSONExtractString(json_data, '{id_key}') != ''
        LIMIT {limit * 2}
    """
    logger.info(f"Executing SQL for {label} (limit={limit * 2}): {query[:500]}...")
    result = await client.query(query)
    if not result.result_rows:
        return

    rows = self._to_pending_rows(source, result.result_rows, excluded, limit)
    await self._insert_pending(client, rows)
    logger.info(f"Added {len(rows)} {label} companies to pending.")


async def _collect_zhilian(self, client, limit: int):
    """Queue up to ``limit`` Zhilian companies that are neither fetched nor pending."""
    logger.info("Collecting Zhilian companies...")
    await self._collect_from_recent_jobs(
        client, limit, "zhilian", "Zhilian", "companyNumber", "companyName"
    )


async def _collect_qcwy(self, client, limit: int):
    """Queue up to ``limit`` QCWY companies that are neither fetched nor pending.

    The job query is deliberately small (7 days, at most 100 rows) and is
    retried on memory errors: first with 3 days / at most 50 rows, then with
    SAMPLE 0.1. A memory error on the last attempt, or any other error, is
    re-raised.
    """
    logger.info("Collecting QCWY companies...")
    known = await self._fetch_known_company_ids(client, "job_data.qcwy_company", "companyId", "QCWY")
    queued = await self._fetch_pending_company_ids(client, "qcwy")
    excluded = known | queued

    days_back = 7
    # length(json_data) is intentionally not used as a filter: computing it
    # would require reading the whole column.
    query_limit = min(limit * 2, 100)

    result = None
    for attempt in range(3):
        sampled = attempt == 2
        if attempt == 1:
            days_back = 3
            query_limit = min(query_limit, 50)
            logger.warning(f"Retry {attempt}: Reducing time range to {days_back} days and limit to {query_limit}")
        elif sampled:
            logger.warning(f"Retry {attempt}: Using SAMPLE 0.1 to reduce memory usage")

        sample_clause = " SAMPLE 0.1" if sampled else ""
        query = f"""
            SELECT DISTINCT
                JSONExtractString(json_data, 'coId') as cid,
                JSONExtractString(json_data, 'companyName') as cname
            FROM job_data.qcwy_job{sample_clause}
            PREWHERE created_at > now() - INTERVAL {days_back} DAY
                AND json_data != ''
            WHERE JSONExtractString(json_data, 'coId') != ''
            LIMIT {query_limit}
        """
        if not sampled:
            logger.info(f"Executing SQL for QCWY (limit={query_limit}, days={days_back}, attempt={attempt+1}): {query[:400]}...")

        try:
            result = await client.query(query)
            break
        except Exception as e:
            if "memory" not in str(e).lower():
                logger.error(f"Query failed with non-memory error: {e}")
                raise
            if attempt >= 2:
                logger.error(f"Query failed after {attempt+1} attempts: {e}")
                raise
            logger.warning(f"Memory error on attempt {attempt+1}: {e}")

    if not result or not result.result_rows:
        logger.info("No QCWY companies found in query result.")
        return

    rows = self._to_pending_rows("qcwy", result.result_rows, excluded, limit)
    if rows:
        await self._insert_pending(client, rows)
        logger.info(f"Added {len(rows)} QCWY companies to pending.")
    else:
        logger.info("No new QCWY companies found after filtering.")


async def _collect_boss(self, client, limit: int):
    """Queue up to ``limit`` Boss companies that are neither fetched nor pending."""
    logger.info("Collecting Boss companies...")
    await self._collect_from_recent_jobs(
        client, limit, "boss", "Boss", "brandId", "brandName"
    )
|
||||
|
||||
async def _insert_pending(self, client, rows: List[Dict[str, Any]]):
|
||||
if not rows:
|
||||
return
|
||||
data: List[List[Any]] = []
|
||||
for r in rows:
|
||||
data.append(
|
||||
[
|
||||
r["source"],
|
||||
r["company_id"],
|
||||
r["company_name"],
|
||||
r["status"],
|
||||
"",
|
||||
r["created_at"],
|
||||
r["updated_at"],
|
||||
1,
|
||||
]
|
||||
)
|
||||
await client.insert(
|
||||
"job_data.pending_company",
|
||||
data,
|
||||
column_names=[
|
||||
"source",
|
||||
"company_id",
|
||||
"company_name",
|
||||
"status",
|
||||
"error_msg",
|
||||
"created_at",
|
||||
"updated_at",
|
||||
"version",
|
||||
],
|
||||
)
|
||||
|
||||
async def process_single_company(
    self,
    source: str,
    company_id: str,
    proxy: Optional[str] = None,
    max_delay_seconds: int = 5,
) -> Dict[str, Any]:
    """Fetch one company's details and record the outcome in ``pending_company``.

    The latest ``pending_company`` row for ``(source, company_id)`` is looked
    up to obtain the stored company name and version; when no row exists the
    arguments are used with an empty name and version 0. The company is then
    fetched via ``_fetch_and_save`` and a new row with ``version + 1`` and the
    resulting status is inserted.

    Args:
        source: Source identifier of the company.
        company_id: Company identifier within that source.
        proxy: Optional proxy passed to ``self._apply_proxy`` before fetching.
        max_delay_seconds: Upper bound of the random pause (in seconds) taken
            before the lookup; a falsy or non-positive value disables it.

    Returns:
        A dict with ``success``, ``source``, ``company_id``, ``company_name``,
        ``status`` (``"done"`` or ``"failed"``), ``error_msg`` and ``version``.
    """
    client = await clickhouse_manager.get_client()
    if proxy:
        self._apply_proxy(proxy)

    # randint(1, n) is always >= 1, so a positive bound always sleeps.
    if max_delay_seconds and max_delay_seconds > 0:
        await asyncio.sleep(random.randint(1, max_delay_seconds))

    # Values are bound as query parameters instead of being interpolated into
    # the SQL text, so quotes in the inputs cannot alter the statement.
    query = """
        SELECT source, company_id, company_name, version
        FROM job_data.pending_company
        FINAL
        WHERE source = {source:String} AND company_id = {company_id:String}
        ORDER BY version DESC
        LIMIT 1
    """
    result = await client.query(
        query,
        parameters={"source": source, "company_id": company_id},
    )
    if result.result_rows:
        source_value, cid, cname, version = result.result_rows[0]
    else:
        source_value, cid, cname, version = source, company_id, "", 0

    try:
        success = await self._fetch_and_save(source_value, cid)
        status = "done" if success else "failed"
        error_msg = "" if success else "Fetch failed"
    except Exception as e:
        # Any failure is recorded on the row rather than propagated.
        logger.error(f"Error processing {source_value} {cid}: {e}")
        status = "failed"
        error_msg = str(e)

    next_version = int(version) + 1
    now = datetime.now()
    # client.insert sends values as data, not SQL text, so the message is
    # stored verbatim (doubling quotes here would corrupt it).
    await client.insert(
        "job_data.pending_company",
        [[source_value, cid, cname, status, error_msg, now, now, next_version]],
        column_names=[
            "source",
            "company_id",
            "company_name",
            "status",
            "error_msg",
            "created_at",
            "updated_at",
            "version",
        ],
    )
    return {
        "success": status == "done",
        "source": source_value,
        "company_id": cid,
        "company_name": cname,
        "status": status,
        "error_msg": error_msg,
        "version": next_version,
    }
|
||||
|
||||
async def process_pending_companies(
    self,
    limit: int = 100,
    source: Optional[str] = None,
    proxy: Optional[str] = None,
    max_delay_seconds: int = 0,
):
    """Process a batch of companies whose status is ``pending``.

    Selects up to ``limit`` pending rows (oldest ``created_at`` first), fetches
    each company via ``_fetch_and_save`` and inserts a new ``pending_company``
    row with ``version + 1`` and the resulting status.

    Args:
        limit: Maximum number of pending rows to process.
        source: When given, only rows of this source are processed.
        proxy: Optional proxy passed to ``self._apply_proxy`` before fetching.
        max_delay_seconds: Upper bound of the random pause (in seconds) taken
            before each fetch; a falsy or non-positive value disables it.
    """
    client = await clickhouse_manager.get_client()
    logger.info(f"Processing pending companies (limit={limit}, source={source or 'all'})...")
    if proxy:
        self._apply_proxy(proxy)

    # The source filter is bound as a query parameter instead of being
    # interpolated into the SQL text; limit is coerced to int for the same
    # reason.
    where_clause = "WHERE status = 'pending'"
    parameters: Dict[str, Any] = {}
    if source:
        where_clause += " AND source = {source:String}"
        parameters["source"] = source
    query = "\n".join(
        [
            "SELECT source, company_id, company_name, version",
            "FROM job_data.pending_company",
            "FINAL",
            where_clause,
            "ORDER BY created_at ASC",
            f"LIMIT {int(limit)}",
        ]
    )
    result = await client.query(query, parameters=parameters)
    if not result.result_rows:
        logger.info("No pending companies to process.")
        return

    column_names = [
        "source",
        "company_id",
        "company_name",
        "status",
        "error_msg",
        "created_at",
        "updated_at",
        "version",
    ]
    for source_value, cid, cname, version in result.result_rows:
        logger.info(f"Processing {source_value} company: {cname} ({cid})")
        try:
            # randint(1, n) is always >= 1, so a positive bound always sleeps.
            if max_delay_seconds and max_delay_seconds > 0:
                await asyncio.sleep(random.randint(1, max_delay_seconds))
            success = await self._fetch_and_save(source_value, cid)
            status = "done" if success else "failed"
            error_msg = "" if success else "Fetch failed"
        except Exception as e:
            # One failing company must not abort the rest of the batch.
            logger.error(f"Error processing {source_value} {cid}: {e}")
            status = "failed"
            error_msg = str(e)

        now = datetime.now()
        # client.insert sends values as data, not SQL text, so the message is
        # stored verbatim (doubling quotes here would corrupt it).
        await client.insert(
            "job_data.pending_company",
            [[source_value, cid, cname, status, error_msg, now, now, int(version) + 1]],
            column_names=column_names,
        )
|
||||
|
||||
async def _fetch_and_save(self, source: str, company_id: str) -> bool:
    """Fetch a company's details from its source and store the raw JSON.

    Args:
        source: One of ``"zhilian"``, ``"qcwy"`` or ``"boss"``; any other
            value yields no data.
        company_id: Company identifier passed to the source service.

    Returns:
        ``True`` when data was fetched and inserted into
        ``job_data.<source>_company``; ``False`` when the service returned
        nothing (or the source is unknown).
    """
    data: Optional[Dict[str, Any]] = None
    target_table = ""
    if source == "zhilian":
        data = self.zhilian_service.get_company_detail(company_id)
        target_table = "zhilian_company"
    elif source == "qcwy":
        data = self.qcwy_service.get_company_info(company_id)
        target_table = "qcwy_company"
    elif source == "boss":
        await self._ensure_boss_token_loaded()
        data = self.boss_service.get_company_detail_by_id(company_id)
        target_table = "boss_company"

    if not data:
        logger.error(f"No data returned from source={source} company_id={company_id}")
        return False

    # Serialise once; the same string is logged (truncated) and stored.
    json_str = json.dumps(data, ensure_ascii=False)
    logger.info(
        f"Raw company data from source={source} company_id={company_id}: "
        f"{json_str[:2000]}"
    )

    client = await clickhouse_manager.get_client()

    # `or` guards cover keys that are present but null in the payload, which
    # a plain .get(key, default) would pass through as None.
    name = ""
    if source == "zhilian":
        name = (data.get("companyBase") or {}).get("companyName") or ""
    elif source == "qcwy":
        name = data.get("companyName") or ""
    elif source == "boss":
        name = data.get("name") or ""

    now = datetime.now()
    await client.insert(
        f"job_data.{target_table}",
        [[0, json_str, name, now, now]],
        column_names=["id", "json_data", "company_name", "created_at", "updated_at"],
    )
    return True
|
||||
|
||||
|
||||
async def cleanup_old_records(self):
    """Remove rows with status 'done' or 'failed' from ``pending_company``.

    Intended to be called daily. A failure of the delete command is logged
    and not re-raised.
    """
    client = await clickhouse_manager.get_client()
    logger.info("Starting cleanup of processed pending companies...")

    # Issued as an ALTER TABLE ... DELETE mutation, which ClickHouse applies
    # asynchronously.
    delete_sql = "ALTER TABLE job_data.pending_company DELETE WHERE status IN ('done', 'failed')"
    try:
        await client.command(delete_sql)
        logger.info("Cleanup command executed successfully.")
    except Exception as exc:
        logger.error(f"Cleanup failed: {exc}")
|
||||
|
||||
# Module-level CompanyCleaner instance, created when this module is imported.
company_cleaner = CompanyCleaner()
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user