ip
This commit is contained in:
parent
20cb35fc6e
commit
42280f8bed
@ -59,6 +59,8 @@ async def push_to_remote(data: Dict[str, Any]) -> bool:
|
|||||||
source_type = data.get("source_type", "未知平台")
|
source_type = data.get("source_type", "未知平台")
|
||||||
title = data.get("title", "未知职位")
|
title = data.get("title", "未知职位")
|
||||||
company = data.get("company_name", data.get("name", "未知公司"))
|
company = data.get("company_name", data.get("name", "未知公司"))
|
||||||
|
logger.info(f"上报数据: [{source_type}] {title} - {company}")
|
||||||
|
print(data)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
url = _build_auth_url()
|
url = _build_auth_url()
|
||||||
|
|||||||
40
docker-compose.spider.yml
Normal file
40
docker-compose.spider.yml
Normal file
@ -0,0 +1,40 @@
|
|||||||
|
# Compose file for the job spiders: one service per platform, each built from
# spider.Dockerfile and sharing the settings defined under x-spider-common.
version: "3.8"

# Shared service definition; merged into every service below via `<<`.
x-spider-common: &spider-common
  build:
    context: .
    dockerfile: spider.Dockerfile
  restart: unless-stopped
  # Shared environment. Entries written as ${VAR:-default} can be overridden
  # from the host environment; the quoted literals are fixed in this file.
  environment: &spider-env
    API_BASE_URL: ${API_BASE_URL:-http://124.222.106.226:9999}
    API_TOKEN: ${API_TOKEN:-dev}
    SLEEP_MIN_SECONDS: ${SLEEP_MIN_SECONDS:-5}
    SLEEP_MAX_SECONDS: ${SLEEP_MAX_SECONDS:-12}
    MAX_PAGES: "100"
    INLINE_COMPANY: "0"
    # The proxy is built in; set PROXY_TUNNEL=none to disable it.
    # NOTE(review): PROXY_TUNNEL is not listed in this map, so it would have to
    # be added here (or per service) for a host-side value to reach the containers.

services:
  # ── Boss Zhipin ──
  spider-boss:
    <<: *spider-common
    environment:
      <<: *spider-env
      PLATFORM: boss
      INSTANCES: "3"

  # ── 51job (Qianchengwuyou) ──
  spider-qcwy:
    <<: *spider-common
    environment:
      <<: *spider-env
      PLATFORM: qcwy
      INSTANCES: "3"

  # ── Zhilian Zhaopin ──
  spider-zhilian:
    <<: *spider-common
    environment:
      <<: *spider-env
      PLATFORM: zhilian
      INSTANCES: "3"
|
||||||
@ -36,3 +36,53 @@ https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&prod
|
|||||||
未获得实例ID,终止
|
未获得实例ID,终止
|
||||||
|
|
||||||
[定时] 开始执行 pipeline:2026-03-22T18:30:00.007107
|
[定时] 开始执行 pipeline:2026-03-22T18:30:00.007107
|
||||||
|
[main] start clearing instances with prefix launch-advisor-20251123
|
||||||
|
当前地域无实例或无匹配实例,无需清理
|
||||||
|
[main] clearing completed
|
||||||
|
[创建] 正在提交创建实例请求
|
||||||
|
InvalidAccountStatus.NotEnoughBalance
|
||||||
|
code: 403, Your account does not have enough balance to order postpaid product. request id: AFA9356D-151E-5F85-82CC-DA00A0A5D2DF
|
||||||
|
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=AFA9356D-151E-5F85-82CC-DA00A0A5D2DF
|
||||||
|
未获得实例ID,终止
|
||||||
|
|
||||||
|
[定时] 开始执行 pipeline:2026-03-23T00:30:00.005997
|
||||||
|
[main] start clearing instances with prefix launch-advisor-20251123
|
||||||
|
当前地域无实例或无匹配实例,无需清理
|
||||||
|
[main] clearing completed
|
||||||
|
[创建] 正在提交创建实例请求
|
||||||
|
InvalidAccountStatus.NotEnoughBalance
|
||||||
|
code: 403, Your account does not have enough balance to order postpaid product. request id: 8831C224-BA9F-5E84-9AA2-8BDCA00C6B21
|
||||||
|
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=8831C224-BA9F-5E84-9AA2-8BDCA00C6B21
|
||||||
|
未获得实例ID,终止
|
||||||
|
|
||||||
|
[定时] 开始执行 pipeline:2026-03-23T06:30:00.006451
|
||||||
|
[main] start clearing instances with prefix launch-advisor-20251123
|
||||||
|
当前地域无实例或无匹配实例,无需清理
|
||||||
|
[main] clearing completed
|
||||||
|
[创建] 正在提交创建实例请求
|
||||||
|
InvalidAccountStatus.NotEnoughBalance
|
||||||
|
code: 403, Your account does not have enough balance to order postpaid product. request id: 60394E0C-C0A4-5D7D-8380-49A7D5EC37A5
|
||||||
|
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=60394E0C-C0A4-5D7D-8380-49A7D5EC37A5
|
||||||
|
未获得实例ID,终止
|
||||||
|
|
||||||
|
[定时] 开始执行 pipeline:2026-03-23T12:30:00.003733
|
||||||
|
[main] start clearing instances with prefix launch-advisor-20251123
|
||||||
|
当前地域无实例或无匹配实例,无需清理
|
||||||
|
[main] clearing completed
|
||||||
|
[创建] 正在提交创建实例请求
|
||||||
|
InvalidAccountStatus.NotEnoughBalance
|
||||||
|
code: 403, Your account does not have enough balance to order postpaid product. request id: 79E1B24A-4D5A-533D-97CB-71616BB5336B
|
||||||
|
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=79E1B24A-4D5A-533D-97CB-71616BB5336B
|
||||||
|
未获得实例ID,终止
|
||||||
|
|
||||||
|
[定时] 开始执行 pipeline:2026-03-23T18:30:00.002226
|
||||||
|
[main] start clearing instances with prefix launch-advisor-20251123
|
||||||
|
当前地域无实例或无匹配实例,无需清理
|
||||||
|
[main] clearing completed
|
||||||
|
[创建] 正在提交创建实例请求
|
||||||
|
InvalidAccountStatus.NotEnoughBalance
|
||||||
|
code: 403, Your account does not have enough balance to order postpaid product. request id: 309194E6-612F-5921-96A1-C6620E0BE99B
|
||||||
|
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=309194E6-612F-5921-96A1-C6620E0BE99B
|
||||||
|
未获得实例ID,终止
|
||||||
|
|
||||||
|
[定时] 开始执行 pipeline:2026-03-24T00:30:00.004105
|
||||||
|
|||||||
55
spider-entrypoint.sh
Normal file
55
spider-entrypoint.sh
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
#!/bin/bash
# Container entrypoint for the spiders.
#
# Starts INSTANCES copies of the Python module selected by PLATFORM, blocks
# until the first of them exits (or SIGTERM/SIGINT arrives), then stops the
# remaining ones and exits with the status that triggered the shutdown.
#
# Environment:
#   PLATFORM   boss | qcwy | zhilian   (default: boss)
#   INSTANCES  number of processes     (default: 1)
set -e

PLATFORM="${PLATFORM:-boss}"
INSTANCES="${INSTANCES:-1}"

echo "=========================================="
echo " Spider Container"
echo " Platform: ${PLATFORM}"
echo " Instances: ${INSTANCES}"
echo " API: ${API_BASE_URL}"
echo " Delay: ${SLEEP_MIN_SECONDS}-${SLEEP_MAX_SECONDS}s"
echo " Inline Co: ${INLINE_COMPANY}"
echo "=========================================="

# Platform -> Python module mapping
case "${PLATFORM}" in
    boss) MODULE="spiderJobs.platforms.boss.main" ;;
    qcwy) MODULE="spiderJobs.platforms.job51.main" ;;
    zhilian) MODULE="spiderJobs.platforms.zhilian.main" ;;
    *)
        echo "[ERROR] Unknown PLATFORM: ${PLATFORM}"
        echo " Supported: boss, qcwy, zhilian"
        exit 1
        ;;
esac

# Start the instances. All of them (including the last one, which the log
# message labels "foreground") run as background jobs so that the shell can
# wait on whichever exits first.
PIDS=()
for ((i = 1; i < INSTANCES; i++)); do
    echo "[spider] Starting ${PLATFORM} instance ${i}/${INSTANCES} (background)..."
    python -m "${MODULE}" &
    PIDS+=("$!")
    sleep 2 # stagger start-up so instances do not grab the same keyword at once
done

echo "[spider] Starting ${PLATFORM} instance ${INSTANCES}/${INSTANCES} (foreground)..."
python -m "${MODULE}" &
PIDS+=("$!")

# Stop every started instance and reap it. Guarded so that it runs once even
# though both the signal trap and the normal exit path call it.
CLEANUP_DONE=0
wait_and_cleanup() {
    if [ "${CLEANUP_DONE}" = "1" ]; then
        return 0
    fi
    CLEANUP_DONE=1
    echo "[spider] Shutting down all instances..."
    for pid in "${PIDS[@]}"; do
        # The process may already be gone; that is not an error.
        kill "$pid" 2>/dev/null || true
    done
    wait
    echo "[spider] All instances stopped."
}

trap wait_and_cleanup SIGTERM SIGINT

# Block until any child exits. `wait -n` returns that child's status (or a
# value above 128 when interrupted by a trapped signal). Under `set -e` a bare
# non-zero return would abort the script before the remaining instances are
# stopped, so the status is captured and re-raised after cleanup instead.
status=0
wait -n || status=$?
wait_and_cleanup
exit "${status}"
|
||||||
41
spider.Dockerfile
Normal file
41
spider.Dockerfile
Normal file
@ -0,0 +1,41 @@
|
|||||||
|
# Image for the spider processes; starts /opt/spider/entrypoint.sh.
FROM python:3.11-slim-bullseye

WORKDIR /opt/spider

# Time zone + basic tools
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && echo "Asia/Shanghai" > /etc/timezone \
    && apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies (the dependency file is copied first to use the Docker layer cache)
COPY crawler_core/pyproject.toml /opt/spider/crawler_core/pyproject.toml
# The version specifier is quoted on purpose: RUN uses shell form, where an
# unquoted `>=8.0` is parsed as a redirection of stdout to a file named "=8.0"
# and the version constraint is silently dropped.
RUN pip install --no-cache-dir \
    requests_go==1.0.9 \
    "tenacity>=8.0" \
    requests \
    PySocks \
    six \
    -i https://pypi.tuna.tsinghua.edu.cn/simple

# Copy the code
COPY crawler_core/ /opt/spider/crawler_core/
COPY spiderJobs/ /opt/spider/spiderJobs/

# Default environment variables
ENV API_BASE_URL=http://124.222.106.226:9999 \
    API_TOKEN=dev \
    PLATFORM=boss \
    SLEEP_MIN_SECONDS=5 \
    SLEEP_MAX_SECONDS=12 \
    MAX_PAGES=100 \
    INLINE_COMPANY=0 \
    PYTHONUNBUFFERED=1
# The proxy is built into the code (spiderJobs/__init__.py)
# Set PROXY_TUNNEL=none to disable the proxy

COPY spider-entrypoint.sh /opt/spider/entrypoint.sh
RUN chmod +x /opt/spider/entrypoint.sh

ENTRYPOINT ["/opt/spider/entrypoint.sh"]
|
||||||
@ -0,0 +1,3 @@
|
|||||||
|
# Default tunnel proxy (Kuaidaili); the exit IP changes automatically on every request.
# The PROXY_TUNNEL environment variable overrides it; setting it to "none" disables the proxy.
# NOTE(review): this URL embeds a proxy username and password in source control.
# Consider rotating the credential and supplying it through the environment or a
# secret store instead of committing it.
DEFAULT_TUNNEL_PROXY = "http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818"
|
||||||
55
spiderJobs/aaa.txt
Normal file
55
spiderJobs/aaa.txt
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
容器运行正常,成功获取到关键词并开始爬取。
|
||||||
|
|
||||||
|
创建完成,3 个文件
|
||||||
|
|
||||||
|
┌───────────────────────────┬─────────────────────────────────┐
|
||||||
|
│ 文件 │ 用途 │
|
||||||
|
├───────────────────────────┼─────────────────────────────────┤
|
||||||
|
│ spider.Dockerfile │ 爬虫专用镜像,轻量无前端/nginx │
|
||||||
|
├───────────────────────────┼─────────────────────────────────┤
|
||||||
|
│ spider-entrypoint.sh │ 支持单容器多进程(INSTANCES=3) │
|
||||||
|
├───────────────────────────┼─────────────────────────────────┤
|
||||||
|
│ docker-compose.spider.yml │ 一键启动三个平台 │
|
||||||
|
└───────────────────────────┴─────────────────────────────────┘
|
||||||
|
|
||||||
|
使用方式
|
||||||
|
|
||||||
|
单机快速启动(3 个平台各 3 个实例 = 9 个爬虫):
|
||||||
|
docker compose -f docker-compose.spider.yml up -d
|
||||||
|
|
||||||
|
多台电脑部署:
|
||||||
|
# 1. 构建并推送到你的镜像仓库
|
||||||
|
docker build -f spider.Dockerfile -t your-registry/jobdata-spider:latest .
|
||||||
|
docker push your-registry/jobdata-spider:latest
|
||||||
|
|
||||||
|
# 2. 每台电脑上拉取运行
|
||||||
|
docker run -d --name spider-boss \
|
||||||
|
-e PLATFORM=boss \
|
||||||
|
-e INSTANCES=3 \
|
||||||
|
-e API_BASE_URL=http://124.222.106.226:9999 \
|
||||||
|
-e SLEEP_MIN_SECONDS=5 \
|
||||||
|
-e SLEEP_MAX_SECONDS=12 \
|
||||||
|
-e INLINE_COMPANY=0 \
|
||||||
|
your-registry/jobdata-spider:latest
|
||||||
|
|
||||||
|
调参:
|
||||||
|
# 加速:降低延迟,多开实例
|
||||||
|
-e SLEEP_MIN_SECONDS=3 -e SLEEP_MAX_SECONDS=8 -e INSTANCES=5
|
||||||
|
|
||||||
|
# 需要代理
|
||||||
|
-e PROXY_TUNNEL=http://xxx:xxx@proxy.example.com:8080   # 需为完整代理 URL(含协议与账号密码);设为 none 则禁用代理
|
||||||
|
|
||||||
|
# Boss 需要 Token(会自动从后端 API 获取,也可手动指定)
|
||||||
|
-e BOSS_MPT=xxx -e BOSS_WT2=xxx
|
||||||
|
|
||||||
|
默认配置已优化:延迟 5-12 秒(比之前的 10-20 秒快一倍),INLINE_COMPANY=0(关掉内联公司爬取)。3 台电脑每台 INSTANCES=3,Boss 吞吐量可以从现在的 5 个/小时 → ~270 个/小时。
|
||||||
|
|
||||||
|
✻ Cooked for 2m 16s
|
||||||
|
|
||||||
|
───────────────────────────────────────────────────────────────────────────────────────
|
||||||
|
❯
|
||||||
|
───────────────────────────────────────────────────────────────────────────────────────
|
||||||
|
Opus 4.6 (1M context) │ JobData █░░░░░░░░░ 14%
|
||||||
|
⏵⏵ accept edits on (shift+tab to cycle)
|
||||||
@ -141,15 +141,11 @@ def main():
|
|||||||
signer = BossSign(mpt=mpt, wt2=wt2)
|
signer = BossSign(mpt=mpt, wt2=wt2)
|
||||||
client_kwargs["signer"] = signer
|
client_kwargs["signer"] = signer
|
||||||
|
|
||||||
tunnel = os.environ.get("PROXY_TUNNEL", "")
|
from spiderJobs import DEFAULT_TUNNEL_PROXY
|
||||||
if tunnel:
|
tunnel_proxy = os.environ.get("PROXY_TUNNEL", DEFAULT_TUNNEL_PROXY)
|
||||||
scheme = os.environ.get("PROXY_SCHEME", "http")
|
if tunnel_proxy and tunnel_proxy.lower() != "none":
|
||||||
username = os.environ.get("PROXY_USERNAME", "")
|
client_kwargs["tunnel_proxy"] = tunnel_proxy
|
||||||
password = os.environ.get("PROXY_PASSWORD", "")
|
print(f"[boss] 隧道代理: {tunnel_proxy.split('@')[-1] if '@' in tunnel_proxy else tunnel_proxy}")
|
||||||
if username and password:
|
|
||||||
client_kwargs["tunnel_proxy"] = f"{scheme}://{username}:{password}@{tunnel}"
|
|
||||||
else:
|
|
||||||
client_kwargs["tunnel_proxy"] = f"{scheme}://{tunnel}"
|
|
||||||
|
|
||||||
run_crawl_loop(
|
run_crawl_loop(
|
||||||
platform="boss",
|
platform="boss",
|
||||||
|
|||||||
@ -90,15 +90,11 @@ def create_company_fetcher(company_id: str, http_client: Job51Client) -> BaseFet
|
|||||||
def main():
|
def main():
|
||||||
client_kwargs = {}
|
client_kwargs = {}
|
||||||
|
|
||||||
tunnel = os.environ.get("PROXY_TUNNEL", "")
|
from spiderJobs import DEFAULT_TUNNEL_PROXY
|
||||||
if tunnel:
|
tunnel_proxy = os.environ.get("PROXY_TUNNEL", DEFAULT_TUNNEL_PROXY)
|
||||||
scheme = os.environ.get("PROXY_SCHEME", "http")
|
if tunnel_proxy and tunnel_proxy.lower() != "none":
|
||||||
username = os.environ.get("PROXY_USERNAME", "")
|
client_kwargs["tunnel_proxy"] = tunnel_proxy
|
||||||
password = os.environ.get("PROXY_PASSWORD", "")
|
print(f"[qcwy] 隧道代理: {tunnel_proxy.split('@')[-1] if '@' in tunnel_proxy else tunnel_proxy}")
|
||||||
if username and password:
|
|
||||||
client_kwargs["tunnel_proxy"] = f"{scheme}://{username}:{password}@{tunnel}"
|
|
||||||
else:
|
|
||||||
client_kwargs["tunnel_proxy"] = f"{scheme}://{tunnel}"
|
|
||||||
|
|
||||||
run_crawl_loop(
|
run_crawl_loop(
|
||||||
platform="qcwy",
|
platform="qcwy",
|
||||||
|
|||||||
@ -51,6 +51,7 @@ class ZhilianClient(HTTPClient):
|
|||||||
self,
|
self,
|
||||||
base_url: str = CGATE_BASE_URL,
|
base_url: str = CGATE_BASE_URL,
|
||||||
signer: Optional[ZhilianSign] = None,
|
signer: Optional[ZhilianSign] = None,
|
||||||
|
tunnel_proxy: Optional[str] = None,
|
||||||
proxy: Optional[str] = None,
|
proxy: Optional[str] = None,
|
||||||
proxy_pool: Optional[list[str]] = None,
|
proxy_pool: Optional[list[str]] = None,
|
||||||
timeout: int = 10,
|
timeout: int = 10,
|
||||||
@ -58,6 +59,7 @@ class ZhilianClient(HTTPClient):
|
|||||||
super().__init__(
|
super().__init__(
|
||||||
base_url=base_url,
|
base_url=base_url,
|
||||||
default_headers=ZHILIAN_HEADERS,
|
default_headers=ZHILIAN_HEADERS,
|
||||||
|
tunnel_proxy=tunnel_proxy,
|
||||||
proxy=proxy,
|
proxy=proxy,
|
||||||
proxy_pool=proxy_pool,
|
proxy_pool=proxy_pool,
|
||||||
timeout=timeout,
|
timeout=timeout,
|
||||||
@ -81,18 +83,20 @@ class ZhilianClient(HTTPClient):
|
|||||||
|
|
||||||
def create_cgate_client(
|
def create_cgate_client(
|
||||||
signer: Optional[ZhilianSign] = None,
|
signer: Optional[ZhilianSign] = None,
|
||||||
|
tunnel_proxy: Optional[str] = None,
|
||||||
proxy: Optional[str] = None,
|
proxy: Optional[str] = None,
|
||||||
proxy_pool: Optional[list[str]] = None,
|
proxy_pool: Optional[list[str]] = None,
|
||||||
) -> ZhilianClient:
|
) -> ZhilianClient:
|
||||||
"""创建 cgate 客户端"""
|
"""创建 cgate 客户端"""
|
||||||
return ZhilianClient(base_url=CGATE_BASE_URL, signer=signer, proxy=proxy, proxy_pool=proxy_pool)
|
return ZhilianClient(base_url=CGATE_BASE_URL, signer=signer, tunnel_proxy=tunnel_proxy, proxy=proxy, proxy_pool=proxy_pool)
|
||||||
|
|
||||||
|
|
||||||
def create_capi_client(
|
def create_capi_client(
|
||||||
signer: Optional[ZhilianSign] = None,
|
signer: Optional[ZhilianSign] = None,
|
||||||
|
tunnel_proxy: Optional[str] = None,
|
||||||
proxy: Optional[str] = None,
|
proxy: Optional[str] = None,
|
||||||
proxy_pool: Optional[list[str]] = None,
|
proxy_pool: Optional[list[str]] = None,
|
||||||
) -> ZhilianClient:
|
) -> ZhilianClient:
|
||||||
"""创建 capi 客户端"""
|
"""创建 capi 客户端"""
|
||||||
return ZhilianClient(base_url=CAPI_BASE_URL, signer=signer, proxy=proxy, proxy_pool=proxy_pool)
|
return ZhilianClient(base_url=CAPI_BASE_URL, signer=signer, tunnel_proxy=tunnel_proxy, proxy=proxy, proxy_pool=proxy_pool)
|
||||||
|
|
||||||
|
|||||||
@ -91,9 +91,12 @@ def create_company_fetcher(company_id: str, http_client: ZhilianClient) -> BaseF
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
client_kwargs = {}
|
client_kwargs = {}
|
||||||
proxy = os.environ.get("PROXY_URL", "")
|
|
||||||
if proxy:
|
from spiderJobs import DEFAULT_TUNNEL_PROXY
|
||||||
client_kwargs["proxy"] = proxy
|
tunnel_proxy = os.environ.get("PROXY_TUNNEL", DEFAULT_TUNNEL_PROXY)
|
||||||
|
if tunnel_proxy and tunnel_proxy.lower() != "none":
|
||||||
|
client_kwargs["tunnel_proxy"] = tunnel_proxy
|
||||||
|
print(f"[zhilian] 隧道代理: {tunnel_proxy.split('@')[-1] if '@' in tunnel_proxy else tunnel_proxy}")
|
||||||
|
|
||||||
run_crawl_loop(
|
run_crawl_loop(
|
||||||
platform="zhilian",
|
platform="zhilian",
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user