Compare commits

..

No commits in common. "42280f8bed2b9ad4944181f32aaa0656b44908da" and "78eee99c2fc2c13acdbdb23278f09cc7065cdc92" have entirely different histories.

14 changed files with 598 additions and 935 deletions

1187
aaa.json

File diff suppressed because it is too large Load Diff

View File

@ -179,30 +179,10 @@ async def _get_active_proxy() -> "str | None":
return None
async def _ensure_db_connection():
    """Make sure the default MySQL connection is usable, rebuilding it if not.

    Runs ``SELECT 1`` on the ``"default"`` Tortoise connection. If that
    raises, the existing connections are closed on a best-effort basis and
    Tortoise is re-initialised from ``TORTOISE_ORM``.

    Raises:
        Exception: whatever ``Tortoise.init`` raises if the rebuild fails;
            that error is intentionally not swallowed here.
    """
    # Imported inside the function, exactly as the original code does.
    from tortoise import Tortoise
    from app.settings import TORTOISE_ORM

    try:
        conn = Tortoise.get_connection("default")
        await conn.execute_query("SELECT 1")
    except Exception as e:
        logger.warning(f"MySQL 连接池不可用({e}),尝试重建...")
        try:
            await Tortoise.close_connections()
        except Exception as close_err:
            # Closing the broken connections stays best-effort (a failure here
            # must not prevent the re-init below), but record the reason
            # instead of discarding it silently.
            logger.debug(f"关闭旧连接池失败,已忽略: {close_err}")
        await Tortoise.init(config=TORTOISE_ORM)
        logger.info("MySQL 连接池重建成功")
async def company_cleaning_job():
"""每5分钟执行自动清洗待处理公司数据"""
from app.services.company_cleaner import company_cleaner
await _ensure_db_connection()
task_id = str(uuid.uuid4())
started_at = datetime.now()
task_name = "company_cleaning_job"

View File

@ -62,17 +62,17 @@ async def push_to_remote(data: Dict[str, Any]) -> bool:
logger.info(f"上报数据: [{source_type}] {title} - {company}")
print(data)
try:
url = _build_auth_url()
client = get_http_client()
response = await client.post(url, json=data, headers=_PUSH_HEADERS)
if response.status_code == 200:
return True
logger.error(f"数据发送失败: {response.status_code} - {response.text[:100]}")
return False
except Exception as e:
logger.error(f"发送异常: {e}")
return False
# try:
# url = _build_auth_url()
# client = get_http_client()
# response = await client.post(url, json=data, headers=_PUSH_HEADERS)
# if response.status_code == 200:
# return True
# logger.error(f"数据发送失败: {response.status_code} - {response.text[:100]}")
# return False
# except Exception as e:
# logger.error(f"发送异常: {e}")
# return False
async def batch_push_to_remote(data_list: List[Dict[str, Any]]) -> None:

View File

@ -49,20 +49,7 @@ class Settings(BaseSettings):
ALERT_WINDOW_MINUTES: int = 10
TORTOISE_ORM: dict = {
"connections": {
"default": {
"engine": "tortoise.backends.mysql",
"credentials": {
"host": "121.4.126.241",
"port": 3306,
"user": "root",
"password": "jobdata123",
"database": "job_data",
"minsize": 1,
"maxsize": 10,
"connect_timeout": 10,
"charset": "utf8mb4",
},
},
"default": "mysql://root:jobdata123@121.4.126.241:3306/job_data"
},
"apps": {
"models": {

View File

@ -1,40 +0,0 @@
# Compose definition for the crawler containers: one service per platform
# (boss, qcwy, zhilian), all built from spider.Dockerfile and sharing the
# environment defaults declared in the x-spider-common anchor.
version: "3.8"
x-spider-common: &spider-common
build:
context: .
dockerfile: spider.Dockerfile
restart: unless-stopped
environment: &spider-env
API_BASE_URL: ${API_BASE_URL:-http://124.222.106.226:9999}
API_TOKEN: ${API_TOKEN:-dev}
SLEEP_MIN_SECONDS: ${SLEEP_MIN_SECONDS:-5}
SLEEP_MAX_SECONDS: ${SLEEP_MAX_SECONDS:-12}
MAX_PAGES: "100"
INLINE_COMPANY: "0"
# The proxy is built in; set PROXY_TUNNEL=none to disable it
services:
# ── Boss Zhipin (Boss直聘) ──
spider-boss:
<<: *spider-common
environment:
<<: *spider-env
PLATFORM: boss
INSTANCES: "3"
# ── 51job (前程无忧) ──
spider-qcwy:
<<: *spider-common
environment:
<<: *spider-env
PLATFORM: qcwy
INSTANCES: "3"
# ── Zhaopin (智联招聘) ──
spider-zhilian:
<<: *spider-common
environment:
<<: *spider-env
PLATFORM: zhilian
INSTANCES: "3"

View File

@ -36,53 +36,3 @@ https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&prod
未获得实例ID终止
[定时] 开始执行 pipeline2026-03-22T18:30:00.007107
[main] start clearing instances with prefix launch-advisor-20251123
当前地域无实例或无匹配实例,无需清理
[main] clearing completed
[创建] 正在提交创建实例请求
InvalidAccountStatus.NotEnoughBalance
code: 403, Your account does not have enough balance to order postpaid product. request id: AFA9356D-151E-5F85-82CC-DA00A0A5D2DF
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=AFA9356D-151E-5F85-82CC-DA00A0A5D2DF
未获得实例ID终止
[定时] 开始执行 pipeline2026-03-23T00:30:00.005997
[main] start clearing instances with prefix launch-advisor-20251123
当前地域无实例或无匹配实例,无需清理
[main] clearing completed
[创建] 正在提交创建实例请求
InvalidAccountStatus.NotEnoughBalance
code: 403, Your account does not have enough balance to order postpaid product. request id: 8831C224-BA9F-5E84-9AA2-8BDCA00C6B21
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=8831C224-BA9F-5E84-9AA2-8BDCA00C6B21
未获得实例ID终止
[定时] 开始执行 pipeline2026-03-23T06:30:00.006451
[main] start clearing instances with prefix launch-advisor-20251123
当前地域无实例或无匹配实例,无需清理
[main] clearing completed
[创建] 正在提交创建实例请求
InvalidAccountStatus.NotEnoughBalance
code: 403, Your account does not have enough balance to order postpaid product. request id: 60394E0C-C0A4-5D7D-8380-49A7D5EC37A5
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=60394E0C-C0A4-5D7D-8380-49A7D5EC37A5
未获得实例ID终止
[定时] 开始执行 pipeline2026-03-23T12:30:00.003733
[main] start clearing instances with prefix launch-advisor-20251123
当前地域无实例或无匹配实例,无需清理
[main] clearing completed
[创建] 正在提交创建实例请求
InvalidAccountStatus.NotEnoughBalance
code: 403, Your account does not have enough balance to order postpaid product. request id: 79E1B24A-4D5A-533D-97CB-71616BB5336B
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=79E1B24A-4D5A-533D-97CB-71616BB5336B
未获得实例ID终止
[定时] 开始执行 pipeline2026-03-23T18:30:00.002226
[main] start clearing instances with prefix launch-advisor-20251123
当前地域无实例或无匹配实例,无需清理
[main] clearing completed
[创建] 正在提交创建实例请求
InvalidAccountStatus.NotEnoughBalance
code: 403, Your account does not have enough balance to order postpaid product. request id: 309194E6-612F-5921-96A1-C6620E0BE99B
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=309194E6-612F-5921-96A1-C6620E0BE99B
未获得实例ID终止
[定时] 开始执行 pipeline2026-03-24T00:30:00.004105

View File

@ -1,55 +0,0 @@
#!/bin/bash
# Container entrypoint: starts INSTANCES copies of the crawler module selected
# by PLATFORM and stops all of them as soon as any one exits or the script
# receives SIGTERM/SIGINT. Exits with the status of the first child to exit
# (or the signal status reported by `wait`).
set -e

PLATFORM="${PLATFORM:-boss}"
INSTANCES="${INSTANCES:-1}"

echo "=========================================="
echo " Spider Container"
echo " Platform: ${PLATFORM}"
echo " Instances: ${INSTANCES}"
echo " API: ${API_BASE_URL}"
echo " Delay: ${SLEEP_MIN_SECONDS}-${SLEEP_MAX_SECONDS}s"
echo " Inline Co: ${INLINE_COMPANY}"
echo "=========================================="

# Platform -> Python module mapping
case "${PLATFORM}" in
    boss) MODULE="spiderJobs.platforms.boss.main" ;;
    qcwy) MODULE="spiderJobs.platforms.job51.main" ;;
    zhilian) MODULE="spiderJobs.platforms.zhilian.main" ;;
    *)
        echo "[ERROR] Unknown PLATFORM: ${PLATFORM}"
        echo " Supported: boss, qcwy, zhilian"
        exit 1
        ;;
esac

# Start the instances. Every instance, including the last one, runs in the
# background so the script can wait on all of them at once.
PIDS=()
for i in $(seq 1 $((INSTANCES - 1))); do
    echo "[spider] Starting ${PLATFORM} instance ${i}/${INSTANCES} (background)..."
    python -m "${MODULE}" &
    PIDS+=("$!")
    sleep 2 # stagger start-up so instances do not grab the same keyword at once
done
echo "[spider] Starting ${PLATFORM} instance ${INSTANCES}/${INSTANCES} (foreground)..."
python -m "${MODULE}" &
PIDS+=("$!")

# Stop every child and reap it. Guarded so it only acts once: a signal runs it
# from the trap, and the main flow calls it again after `wait -n` returns.
CLEANED_UP=0
wait_and_cleanup() {
    if [ "${CLEANED_UP}" -eq 1 ]; then
        return 0
    fi
    CLEANED_UP=1
    echo "[spider] Shutting down all instances..."
    for pid in "${PIDS[@]}"; do
        kill "$pid" 2>/dev/null || true
    done
    wait
    echo "[spider] All instances stopped."
}
trap wait_and_cleanup SIGTERM SIGINT

# Wait for any child to exit. The `|| status=$?` keeps `set -e` from aborting
# the script before cleanup when that child exited non-zero (previously the
# remaining instances were left running in that case).
status=0
wait -n || status=$?
wait_and_cleanup
exit "${status}"

View File

@ -1,41 +0,0 @@
# Crawler image: installs the Python dependencies, copies crawler_core/ and
# spiderJobs/ into /opt/spider, and starts entrypoint.sh.
FROM python:3.11-slim-bullseye
WORKDIR /opt/spider
# Time zone + basic tooling
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && echo "Asia/Shanghai" > /etc/timezone \
    && apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*
# Python dependencies (the dependency manifest is copied first to use the Docker layer cache)
COPY crawler_core/pyproject.toml /opt/spider/crawler_core/pyproject.toml
# The version specifier must be quoted: in shell form an unquoted `>=8.0` is a
# redirection of pip's stdout to a file named "=8.0", which silently drops the
# version constraint on tenacity.
RUN pip install --no-cache-dir \
    requests_go==1.0.9 \
    "tenacity>=8.0" \
    requests \
    PySocks \
    six \
    -i https://pypi.tuna.tsinghua.edu.cn/simple
# Copy the code
COPY crawler_core/ /opt/spider/crawler_core/
COPY spiderJobs/ /opt/spider/spiderJobs/
# Default environment variables
ENV API_BASE_URL=http://124.222.106.226:9999 \
    API_TOKEN=dev \
    PLATFORM=boss \
    SLEEP_MIN_SECONDS=5 \
    SLEEP_MAX_SECONDS=12 \
    MAX_PAGES=100 \
    INLINE_COMPANY=0 \
    PYTHONUNBUFFERED=1
# The proxy is built into the code (spiderJobs/__init__.py);
# set PROXY_TUNNEL=none to disable it
COPY spider-entrypoint.sh /opt/spider/entrypoint.sh
RUN chmod +x /opt/spider/entrypoint.sh
ENTRYPOINT ["/opt/spider/entrypoint.sh"]

View File

@ -1,3 +0,0 @@
# Default tunnel proxy (Kuaidaili / 快代理); the IP is rotated automatically on every request.
# The PROXY_TUNNEL environment variable overrides it; setting it to "none" disables the proxy.
# NOTE(review): this URL embeds the proxy account's username and password in
# source control — consider loading it from the environment and rotating the credential.
DEFAULT_TUNNEL_PROXY = "http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818"

View File

@ -1,55 +0,0 @@
容器运行正常,成功获取到关键词并开始爬取。
创建完成,共 3 个文件:
┌───────────────────────────┬─────────────────────────────────┐
│ 文件 │ 用途 │
├───────────────────────────┼─────────────────────────────────┤
│ spider.Dockerfile │ 爬虫专用镜像,轻量无前端/nginx │
├───────────────────────────┼─────────────────────────────────┤
│ spider-entrypoint.sh │ 支持单容器多进程(INSTANCES=3) │
├───────────────────────────┼─────────────────────────────────┤
│ docker-compose.spider.yml │ 一键启动三个平台 │
└───────────────────────────┴─────────────────────────────────┘
使用方式
单机快速启动(3 个平台各 3 个实例 = 9 个爬虫):
docker compose -f docker-compose.spider.yml up -d
多台电脑部署:
# 1. 构建并推送到你的镜像仓库
docker build -f spider.Dockerfile -t your-registry/jobdata-spider:latest .
docker push your-registry/jobdata-spider:latest
# 2. 每台电脑上拉取运行
docker run -d --name spider-boss \
-e PLATFORM=boss \
-e INSTANCES=3 \
-e API_BASE_URL=http://124.222.106.226:9999 \
-e SLEEP_MIN_SECONDS=5 \
-e SLEEP_MAX_SECONDS=12 \
-e INLINE_COMPANY=0 \
your-registry/jobdata-spider:latest
调参:
# 加速:降低延迟,多开实例
-e SLEEP_MIN_SECONDS=3 -e SLEEP_MAX_SECONDS=8 -e INSTANCES=5
# 需要代理
-e PROXY_TUNNEL=proxy.example.com:8080 -e PROXY_USERNAME=xxx -e PROXY_PASSWORD=xxx
# Boss 需要 Token(会自动从后端 API 获取,也可手动指定)
-e BOSS_MPT=xxx -e BOSS_WT2=xxx
默认配置已优化:延迟 5-12 秒(比之前的 10-20 秒
快一倍),INLINE_COMPANY=0(关掉内联公司爬取)。3 台电脑每台 INSTANCES=3,Boss
吞吐量可以从现在的 5 个/小时 → ~270 个/小时。
✻ Cooked for 2m 16s
───────────────────────────────────────────────────────────────────────────────────────
 
───────────────────────────────────────────────────────────────────────────────────────
Opus 4.6 (1M context) │ JobData █░░░░░░░░░ 14%
⏵⏵ accept edits on (shift+tab to cycle)

View File

@ -141,11 +141,15 @@ def main():
signer = BossSign(mpt=mpt, wt2=wt2)
client_kwargs["signer"] = signer
from spiderJobs import DEFAULT_TUNNEL_PROXY
tunnel_proxy = os.environ.get("PROXY_TUNNEL", DEFAULT_TUNNEL_PROXY)
if tunnel_proxy and tunnel_proxy.lower() != "none":
client_kwargs["tunnel_proxy"] = tunnel_proxy
print(f"[boss] 隧道代理: {tunnel_proxy.split('@')[-1] if '@' in tunnel_proxy else tunnel_proxy}")
tunnel = os.environ.get("PROXY_TUNNEL", "")
if tunnel:
scheme = os.environ.get("PROXY_SCHEME", "http")
username = os.environ.get("PROXY_USERNAME", "")
password = os.environ.get("PROXY_PASSWORD", "")
if username and password:
client_kwargs["tunnel_proxy"] = f"{scheme}://{username}:{password}@{tunnel}"
else:
client_kwargs["tunnel_proxy"] = f"{scheme}://{tunnel}"
run_crawl_loop(
platform="boss",

View File

@ -90,11 +90,15 @@ def create_company_fetcher(company_id: str, http_client: Job51Client) -> BaseFet
def main():
client_kwargs = {}
from spiderJobs import DEFAULT_TUNNEL_PROXY
tunnel_proxy = os.environ.get("PROXY_TUNNEL", DEFAULT_TUNNEL_PROXY)
if tunnel_proxy and tunnel_proxy.lower() != "none":
client_kwargs["tunnel_proxy"] = tunnel_proxy
print(f"[qcwy] 隧道代理: {tunnel_proxy.split('@')[-1] if '@' in tunnel_proxy else tunnel_proxy}")
tunnel = os.environ.get("PROXY_TUNNEL", "")
if tunnel:
scheme = os.environ.get("PROXY_SCHEME", "http")
username = os.environ.get("PROXY_USERNAME", "")
password = os.environ.get("PROXY_PASSWORD", "")
if username and password:
client_kwargs["tunnel_proxy"] = f"{scheme}://{username}:{password}@{tunnel}"
else:
client_kwargs["tunnel_proxy"] = f"{scheme}://{tunnel}"
run_crawl_loop(
platform="qcwy",

View File

@ -51,7 +51,6 @@ class ZhilianClient(HTTPClient):
self,
base_url: str = CGATE_BASE_URL,
signer: Optional[ZhilianSign] = None,
tunnel_proxy: Optional[str] = None,
proxy: Optional[str] = None,
proxy_pool: Optional[list[str]] = None,
timeout: int = 10,
@ -59,7 +58,6 @@ class ZhilianClient(HTTPClient):
super().__init__(
base_url=base_url,
default_headers=ZHILIAN_HEADERS,
tunnel_proxy=tunnel_proxy,
proxy=proxy,
proxy_pool=proxy_pool,
timeout=timeout,
@ -83,20 +81,18 @@ class ZhilianClient(HTTPClient):
def create_cgate_client(
signer: Optional[ZhilianSign] = None,
tunnel_proxy: Optional[str] = None,
proxy: Optional[str] = None,
proxy_pool: Optional[list[str]] = None,
) -> ZhilianClient:
"""创建 cgate 客户端"""
return ZhilianClient(base_url=CGATE_BASE_URL, signer=signer, tunnel_proxy=tunnel_proxy, proxy=proxy, proxy_pool=proxy_pool)
return ZhilianClient(base_url=CGATE_BASE_URL, signer=signer, proxy=proxy, proxy_pool=proxy_pool)
def create_capi_client(
signer: Optional[ZhilianSign] = None,
tunnel_proxy: Optional[str] = None,
proxy: Optional[str] = None,
proxy_pool: Optional[list[str]] = None,
) -> ZhilianClient:
"""创建 capi 客户端"""
return ZhilianClient(base_url=CAPI_BASE_URL, signer=signer, tunnel_proxy=tunnel_proxy, proxy=proxy, proxy_pool=proxy_pool)
return ZhilianClient(base_url=CAPI_BASE_URL, signer=signer, proxy=proxy, proxy_pool=proxy_pool)

View File

@ -91,12 +91,9 @@ def create_company_fetcher(company_id: str, http_client: ZhilianClient) -> BaseF
def main():
client_kwargs = {}
from spiderJobs import DEFAULT_TUNNEL_PROXY
tunnel_proxy = os.environ.get("PROXY_TUNNEL", DEFAULT_TUNNEL_PROXY)
if tunnel_proxy and tunnel_proxy.lower() != "none":
client_kwargs["tunnel_proxy"] = tunnel_proxy
print(f"[zhilian] 隧道代理: {tunnel_proxy.split('@')[-1] if '@' in tunnel_proxy else tunnel_proxy}")
proxy = os.environ.get("PROXY_URL", "")
if proxy:
client_kwargs["proxy"] = proxy
run_crawl_loop(
platform="zhilian",