This commit is contained in:
win 2026-03-24 01:49:31 +08:00
parent 20cb35fc6e
commit 42280f8bed
12 changed files with 886 additions and 580 deletions

1175
aaa.json

File diff suppressed because it is too large Load Diff

View File

@ -59,6 +59,8 @@ async def push_to_remote(data: Dict[str, Any]) -> bool:
source_type = data.get("source_type", "未知平台")
title = data.get("title", "未知职位")
company = data.get("company_name", data.get("name", "未知公司"))
logger.info(f"上报数据: [{source_type}] {title} - {company}")
print(data)
try:
url = _build_auth_url()

40
docker-compose.spider.yml Normal file
View File

@ -0,0 +1,40 @@
# Compose file for the job spiders: one service per platform, all built from
# spider.Dockerfile and sharing a common template through YAML anchors.
version: "3.8"
# Shared service template (build settings, restart policy, base environment).
# Keys starting with "x-" are Compose extension fields and are not services.
x-spider-common: &spider-common
build:
context: .
dockerfile: spider.Dockerfile
restart: unless-stopped
# Base environment. ${VAR:-default} is Compose interpolation: the host/.env
# value is used when set and non-empty, otherwise the default after ":-".
environment: &spider-env
API_BASE_URL: ${API_BASE_URL:-http://124.222.106.226:9999}
API_TOKEN: ${API_TOKEN:-dev}
SLEEP_MIN_SECONDS: ${SLEEP_MIN_SECONDS:-5}
SLEEP_MAX_SECONDS: ${SLEEP_MAX_SECONDS:-12}
MAX_PAGES: "100"
INLINE_COMPANY: "0"
# A proxy is built into the spider code; set PROXY_TUNNEL=none to disable it
services:
# ── Boss Zhipin ──
spider-boss:
<<: *spider-common
# YAML merge is shallow: declaring `environment` here replaces the one from
# the template, so the base environment is merged back in explicitly.
environment:
<<: *spider-env
PLATFORM: boss
# Number of spider processes the entrypoint starts inside this container.
INSTANCES: "3"
# ── 51job (qcwy) ──
spider-qcwy:
<<: *spider-common
environment:
<<: *spider-env
PLATFORM: qcwy
INSTANCES: "3"
# ── Zhilian Zhaopin ──
spider-zhilian:
<<: *spider-common
environment:
<<: *spider-env
PLATFORM: zhilian
INSTANCES: "3"

View File

@ -36,3 +36,53 @@ https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&prod
未获得实例ID终止
[定时] 开始执行 pipeline2026-03-22T18:30:00.007107
[main] start clearing instances with prefix launch-advisor-20251123
当前地域无实例或无匹配实例,无需清理
[main] clearing completed
[创建] 正在提交创建实例请求
InvalidAccountStatus.NotEnoughBalance
code: 403, Your account does not have enough balance to order postpaid product. request id: AFA9356D-151E-5F85-82CC-DA00A0A5D2DF
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=AFA9356D-151E-5F85-82CC-DA00A0A5D2DF
未获得实例ID终止
[定时] 开始执行 pipeline2026-03-23T00:30:00.005997
[main] start clearing instances with prefix launch-advisor-20251123
当前地域无实例或无匹配实例,无需清理
[main] clearing completed
[创建] 正在提交创建实例请求
InvalidAccountStatus.NotEnoughBalance
code: 403, Your account does not have enough balance to order postpaid product. request id: 8831C224-BA9F-5E84-9AA2-8BDCA00C6B21
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=8831C224-BA9F-5E84-9AA2-8BDCA00C6B21
未获得实例ID终止
[定时] 开始执行 pipeline2026-03-23T06:30:00.006451
[main] start clearing instances with prefix launch-advisor-20251123
当前地域无实例或无匹配实例,无需清理
[main] clearing completed
[创建] 正在提交创建实例请求
InvalidAccountStatus.NotEnoughBalance
code: 403, Your account does not have enough balance to order postpaid product. request id: 60394E0C-C0A4-5D7D-8380-49A7D5EC37A5
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=60394E0C-C0A4-5D7D-8380-49A7D5EC37A5
未获得实例ID终止
[定时] 开始执行 pipeline2026-03-23T12:30:00.003733
[main] start clearing instances with prefix launch-advisor-20251123
当前地域无实例或无匹配实例,无需清理
[main] clearing completed
[创建] 正在提交创建实例请求
InvalidAccountStatus.NotEnoughBalance
code: 403, Your account does not have enough balance to order postpaid product. request id: 79E1B24A-4D5A-533D-97CB-71616BB5336B
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=79E1B24A-4D5A-533D-97CB-71616BB5336B
未获得实例ID终止
[定时] 开始执行 pipeline2026-03-23T18:30:00.002226
[main] start clearing instances with prefix launch-advisor-20251123
当前地域无实例或无匹配实例,无需清理
[main] clearing completed
[创建] 正在提交创建实例请求
InvalidAccountStatus.NotEnoughBalance
code: 403, Your account does not have enough balance to order postpaid product. request id: 309194E6-612F-5921-96A1-C6620E0BE99B
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=309194E6-612F-5921-96A1-C6620E0BE99B
未获得实例ID终止
[定时] 开始执行 pipeline2026-03-24T00:30:00.004105

55
spider-entrypoint.sh Normal file
View File

@ -0,0 +1,55 @@
#!/bin/bash
# Container entrypoint for the spiders.
#
# Starts INSTANCES copies of the Python module that belongs to PLATFORM and
# keeps them running as one group: as soon as any copy exits, or the container
# receives SIGTERM/SIGINT, every remaining copy is stopped and the script exits.
#
# Environment:
#   PLATFORM   boss | qcwy | zhilian   (default: boss)
#   INSTANCES  number of processes     (default: 1)
set -e

PLATFORM="${PLATFORM:-boss}"
INSTANCES="${INSTANCES:-1}"

# INSTANCES drives the launch loop below; anything that is not a positive
# integer falls back to a single instance instead of breaking the arithmetic.
# The 10# prefix forces base 10 so a value such as "08" is not read as octal.
if [[ "${INSTANCES}" =~ ^[0-9]+$ ]] && (( 10#${INSTANCES} >= 1 )); then
    INSTANCES=$((10#${INSTANCES}))
else
    echo "[WARN] Invalid INSTANCES='${INSTANCES}', falling back to 1" >&2
    INSTANCES=1
fi

echo "=========================================="
echo " Spider Container"
echo " Platform: ${PLATFORM}"
echo " Instances: ${INSTANCES}"
echo " API: ${API_BASE_URL}"
echo " Delay: ${SLEEP_MIN_SECONDS}-${SLEEP_MAX_SECONDS}s"
echo " Inline Co: ${INLINE_COMPANY}"
echo "=========================================="

# Platform -> Python module mapping
case "${PLATFORM}" in
    boss)    MODULE="spiderJobs.platforms.boss.main" ;;
    qcwy)    MODULE="spiderJobs.platforms.job51.main" ;;
    zhilian) MODULE="spiderJobs.platforms.zhilian.main" ;;
    *)
        echo "[ERROR] Unknown PLATFORM: ${PLATFORM}"
        echo " Supported: boss, qcwy, zhilian"
        exit 1
        ;;
esac

PIDS=()
CLEANED_UP=0

# Stop every child that is still running and reap them all.
# Guarded by CLEANED_UP so it is safe to reach from both the signal trap and
# the normal exit path.
wait_and_cleanup() {
    if [ "${CLEANED_UP}" -eq 1 ]; then
        return 0
    fi
    CLEANED_UP=1
    echo "[spider] Shutting down all instances..."
    for pid in "${PIDS[@]}"; do
        # A child may already be gone; that is not an error.
        kill "$pid" 2>/dev/null || true
    done
    wait || true
    echo "[spider] All instances stopped."
}

# Install the handlers before any child is started so a stop request that
# arrives during the staggered start-up is still honoured.
# Exit codes follow the 128 + signal-number convention.
trap 'wait_and_cleanup; exit 143' SIGTERM
trap 'wait_and_cleanup; exit 130' SIGINT

# Start all instances. Every one of them, including the last, runs as a
# background job; the "(foreground)" wording is kept only for log continuity.
for ((i = 1; i < INSTANCES; i++)); do
    echo "[spider] Starting ${PLATFORM} instance ${i}/${INSTANCES} (background)..."
    python -m "${MODULE}" &
    PIDS+=("$!")
    sleep 2  # stagger start-up so instances do not grab the same keyword at once
done
echo "[spider] Starting ${PLATFORM} instance ${INSTANCES}/${INSTANCES} (foreground)..."
python -m "${MODULE}" &
PIDS+=("$!")

# Block until any child exits. The "||" keeps `set -e` from aborting the
# script on a non-zero child status, which would skip the cleanup below.
EXIT_CODE=0
wait -n || EXIT_CODE=$?

wait_and_cleanup
exit "${EXIT_CODE}"

41
spider.Dockerfile Normal file
View File

@ -0,0 +1,41 @@
# Image for the job spiders: Python runtime plus the crawler_core and
# spiderJobs packages, started through spider-entrypoint.sh.
FROM python:3.11-slim-bullseye

WORKDIR /opt/spider

# Time zone + basic tools
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && echo "Asia/Shanghai" > /etc/timezone \
    && apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies (the dependency file is copied first to make use of the
# Docker layer cache).
COPY crawler_core/pyproject.toml /opt/spider/crawler_core/pyproject.toml
# The version specifier is quoted on purpose: shell-form RUN goes through
# /bin/sh, where an unquoted `tenacity>=8.0` means "argument tenacity, stdout
# redirected to a file named =8.0" and the constraint is silently lost.
RUN pip install --no-cache-dir \
        requests_go==1.0.9 \
        "tenacity>=8.0" \
        requests \
        PySocks \
        six \
        -i https://pypi.tuna.tsinghua.edu.cn/simple

# Copy the application code
COPY crawler_core/ /opt/spider/crawler_core/
COPY spiderJobs/ /opt/spider/spiderJobs/

# Default environment variables (override at run time with -e / compose)
ENV API_BASE_URL=http://124.222.106.226:9999 \
    API_TOKEN=dev \
    PLATFORM=boss \
    SLEEP_MIN_SECONDS=5 \
    SLEEP_MAX_SECONDS=12 \
    MAX_PAGES=100 \
    INLINE_COMPANY=0 \
    PYTHONUNBUFFERED=1

# A proxy is built into the code (spiderJobs/__init__.py);
# set PROXY_TUNNEL=none to disable it.
COPY spider-entrypoint.sh /opt/spider/entrypoint.sh
RUN chmod +x /opt/spider/entrypoint.sh

ENTRYPOINT ["/opt/spider/entrypoint.sh"]

View File

@ -0,0 +1,3 @@
# Default tunnel proxy (Kuaidaili); per the original note, the exit IP changes
# automatically on every request.
# The PROXY_TUNNEL environment variable overrides this value; setting it to
# "none" disables the proxy.
# NOTE(review): this URL embeds a proxy username and password in source
# control. Presumably these are live credentials — confirm, rotate them, and
# load the value from the environment or a secret store instead.
DEFAULT_TUNNEL_PROXY = "http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818"

55
spiderJobs/aaa.txt Normal file
View File

@ -0,0 +1,55 @@
容器运行正常,成功获取到关键词并开始爬取。
创建完成,共 3 个文件:
┌───────────────────────────┬─────────────────────────────────┐
│ 文件 │ 用途 │
├───────────────────────────┼─────────────────────────────────┤
│ spider.Dockerfile │ 爬虫专用镜像,轻量无前端/nginx │
├───────────────────────────┼─────────────────────────────────┤
│ spider-entrypoint.sh │ 支持单容器多进程(INSTANCES=3) │
├───────────────────────────┼─────────────────────────────────┤
│ docker-compose.spider.yml │ 一键启动三个平台 │
└───────────────────────────┴─────────────────────────────────┘
使用方式
单机快速启动(3 个平台各 3 个实例 = 9 个爬虫):
docker compose -f docker-compose.spider.yml up -d
多台电脑部署:
# 1. 构建并推送到你的镜像仓库
docker build -f spider.Dockerfile -t your-registry/jobdata-spider:latest .
docker push your-registry/jobdata-spider:latest
# 2. 每台电脑上拉取运行
docker run -d --name spider-boss \
-e PLATFORM=boss \
-e INSTANCES=3 \
-e API_BASE_URL=http://124.222.106.226:9999 \
-e SLEEP_MIN_SECONDS=5 \
-e SLEEP_MAX_SECONDS=12 \
-e INLINE_COMPANY=0 \
your-registry/jobdata-spider:latest
调参:
# 加速:降低延迟,多开实例
-e SLEEP_MIN_SECONDS=3 -e SLEEP_MAX_SECONDS=8 -e INSTANCES=5
# 需要自定义代理(PROXY_TUNNEL 为完整的代理 URL,可包含账号密码;设为 none 则禁用内置代理)
-e PROXY_TUNNEL=http://user:pass@proxy.example.com:8080
# Boss 需要 Token(会自动从后端 API 获取,也可手动指定)
-e BOSS_MPT=xxx -e BOSS_WT2=xxx
默认配置已优化:延迟 5-12 秒(比之前的 10-20 秒快一倍),INLINE_COMPANY=0(关掉内联公司爬取)。
3 台电脑每台 INSTANCES=3 时,Boss 吞吐量可以从现在的 5 个/小时提升到约 270 个/小时。
✻ Cooked for 2m 16s
───────────────────────────────────────────────────────────────────────────────────────
 
───────────────────────────────────────────────────────────────────────────────────────
Opus 4.6 (1M context) │ JobData █░░░░░░░░░ 14%
⏵⏵ accept edits on (shift+tab to cycle)

View File

@ -141,15 +141,11 @@ def main():
signer = BossSign(mpt=mpt, wt2=wt2)
client_kwargs["signer"] = signer
tunnel = os.environ.get("PROXY_TUNNEL", "")
if tunnel:
scheme = os.environ.get("PROXY_SCHEME", "http")
username = os.environ.get("PROXY_USERNAME", "")
password = os.environ.get("PROXY_PASSWORD", "")
if username and password:
client_kwargs["tunnel_proxy"] = f"{scheme}://{username}:{password}@{tunnel}"
else:
client_kwargs["tunnel_proxy"] = f"{scheme}://{tunnel}"
from spiderJobs import DEFAULT_TUNNEL_PROXY
tunnel_proxy = os.environ.get("PROXY_TUNNEL", DEFAULT_TUNNEL_PROXY)
if tunnel_proxy and tunnel_proxy.lower() != "none":
client_kwargs["tunnel_proxy"] = tunnel_proxy
print(f"[boss] 隧道代理: {tunnel_proxy.split('@')[-1] if '@' in tunnel_proxy else tunnel_proxy}")
run_crawl_loop(
platform="boss",

View File

@ -90,15 +90,11 @@ def create_company_fetcher(company_id: str, http_client: Job51Client) -> BaseFet
def main():
client_kwargs = {}
tunnel = os.environ.get("PROXY_TUNNEL", "")
if tunnel:
scheme = os.environ.get("PROXY_SCHEME", "http")
username = os.environ.get("PROXY_USERNAME", "")
password = os.environ.get("PROXY_PASSWORD", "")
if username and password:
client_kwargs["tunnel_proxy"] = f"{scheme}://{username}:{password}@{tunnel}"
else:
client_kwargs["tunnel_proxy"] = f"{scheme}://{tunnel}"
from spiderJobs import DEFAULT_TUNNEL_PROXY
tunnel_proxy = os.environ.get("PROXY_TUNNEL", DEFAULT_TUNNEL_PROXY)
if tunnel_proxy and tunnel_proxy.lower() != "none":
client_kwargs["tunnel_proxy"] = tunnel_proxy
print(f"[qcwy] 隧道代理: {tunnel_proxy.split('@')[-1] if '@' in tunnel_proxy else tunnel_proxy}")
run_crawl_loop(
platform="qcwy",

View File

@ -51,6 +51,7 @@ class ZhilianClient(HTTPClient):
self,
base_url: str = CGATE_BASE_URL,
signer: Optional[ZhilianSign] = None,
tunnel_proxy: Optional[str] = None,
proxy: Optional[str] = None,
proxy_pool: Optional[list[str]] = None,
timeout: int = 10,
@ -58,6 +59,7 @@ class ZhilianClient(HTTPClient):
super().__init__(
base_url=base_url,
default_headers=ZHILIAN_HEADERS,
tunnel_proxy=tunnel_proxy,
proxy=proxy,
proxy_pool=proxy_pool,
timeout=timeout,
@ -81,18 +83,20 @@ class ZhilianClient(HTTPClient):
def create_cgate_client(
signer: Optional[ZhilianSign] = None,
tunnel_proxy: Optional[str] = None,
proxy: Optional[str] = None,
proxy_pool: Optional[list[str]] = None,
) -> ZhilianClient:
"""创建 cgate 客户端"""
return ZhilianClient(base_url=CGATE_BASE_URL, signer=signer, proxy=proxy, proxy_pool=proxy_pool)
return ZhilianClient(base_url=CGATE_BASE_URL, signer=signer, tunnel_proxy=tunnel_proxy, proxy=proxy, proxy_pool=proxy_pool)
def create_capi_client(
signer: Optional[ZhilianSign] = None,
tunnel_proxy: Optional[str] = None,
proxy: Optional[str] = None,
proxy_pool: Optional[list[str]] = None,
) -> ZhilianClient:
"""创建 capi 客户端"""
return ZhilianClient(base_url=CAPI_BASE_URL, signer=signer, proxy=proxy, proxy_pool=proxy_pool)
return ZhilianClient(base_url=CAPI_BASE_URL, signer=signer, tunnel_proxy=tunnel_proxy, proxy=proxy, proxy_pool=proxy_pool)

View File

@ -91,9 +91,12 @@ def create_company_fetcher(company_id: str, http_client: ZhilianClient) -> BaseF
def main():
client_kwargs = {}
proxy = os.environ.get("PROXY_URL", "")
if proxy:
client_kwargs["proxy"] = proxy
from spiderJobs import DEFAULT_TUNNEL_PROXY
tunnel_proxy = os.environ.get("PROXY_TUNNEL", DEFAULT_TUNNEL_PROXY)
if tunnel_proxy and tunnel_proxy.lower() != "none":
client_kwargs["tunnel_proxy"] = tunnel_proxy
print(f"[zhilian] 隧道代理: {tunnel_proxy.split('@')[-1] if '@' in tunnel_proxy else tunnel_proxy}")
run_crawl_loop(
platform="zhilian",