ip
This commit is contained in:
parent
20cb35fc6e
commit
42280f8bed
@ -59,6 +59,8 @@ async def push_to_remote(data: Dict[str, Any]) -> bool:
|
||||
source_type = data.get("source_type", "未知平台")
|
||||
title = data.get("title", "未知职位")
|
||||
company = data.get("company_name", data.get("name", "未知公司"))
|
||||
logger.info(f"上报数据: [{source_type}] {title} - {company}")
|
||||
print(data)
|
||||
|
||||
try:
|
||||
url = _build_auth_url()
|
||||
|
||||
40
docker-compose.spider.yml
Normal file
40
docker-compose.spider.yml
Normal file
@ -0,0 +1,40 @@
|
||||
version: "3.8"

# Settings shared by every spider service; pulled in with `<<: *spider-common`.
x-spider-common: &spider-common
  build:
    context: .
    dockerfile: spider.Dockerfile
  restart: unless-stopped
  environment: &spider-env
    API_BASE_URL: ${API_BASE_URL:-http://124.222.106.226:9999}
    API_TOKEN: ${API_TOKEN:-dev}
    SLEEP_MIN_SECONDS: ${SLEEP_MIN_SECONDS:-5}
    SLEEP_MAX_SECONDS: ${SLEEP_MAX_SECONDS:-12}
    MAX_PAGES: "100"
    INLINE_COMPANY: "0"
    # A default proxy is built into the spider code; set PROXY_TUNNEL=none to
    # disable it. Key-only entry: the value is taken from the environment
    # Compose runs in, and the variable stays unset inside the container when
    # it is not defined there, so the built-in default still applies.
    PROXY_TUNNEL:

services:
  # ── Boss Zhipin ──
  spider-boss:
    <<: *spider-common
    # `<<` merges shallowly, so the shared environment is merged in again here.
    environment:
      <<: *spider-env
      PLATFORM: boss
      INSTANCES: "3"

  # ── 51job (qcwy) ──
  spider-qcwy:
    <<: *spider-common
    environment:
      <<: *spider-env
      PLATFORM: qcwy
      INSTANCES: "3"

  # ── Zhaopin (zhilian) ──
  spider-zhilian:
    <<: *spider-common
    environment:
      <<: *spider-env
      PLATFORM: zhilian
      INSTANCES: "3"
|
||||
@ -36,3 +36,53 @@ https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&prod
|
||||
未获得实例ID,终止
|
||||
|
||||
[定时] 开始执行 pipeline:2026-03-22T18:30:00.007107
|
||||
[main] start clearing instances with prefix launch-advisor-20251123
|
||||
当前地域无实例或无匹配实例,无需清理
|
||||
[main] clearing completed
|
||||
[创建] 正在提交创建实例请求
|
||||
InvalidAccountStatus.NotEnoughBalance
|
||||
code: 403, Your account does not have enough balance to order postpaid product. request id: AFA9356D-151E-5F85-82CC-DA00A0A5D2DF
|
||||
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=AFA9356D-151E-5F85-82CC-DA00A0A5D2DF
|
||||
未获得实例ID,终止
|
||||
|
||||
[定时] 开始执行 pipeline:2026-03-23T00:30:00.005997
|
||||
[main] start clearing instances with prefix launch-advisor-20251123
|
||||
当前地域无实例或无匹配实例,无需清理
|
||||
[main] clearing completed
|
||||
[创建] 正在提交创建实例请求
|
||||
InvalidAccountStatus.NotEnoughBalance
|
||||
code: 403, Your account does not have enough balance to order postpaid product. request id: 8831C224-BA9F-5E84-9AA2-8BDCA00C6B21
|
||||
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=8831C224-BA9F-5E84-9AA2-8BDCA00C6B21
|
||||
未获得实例ID,终止
|
||||
|
||||
[定时] 开始执行 pipeline:2026-03-23T06:30:00.006451
|
||||
[main] start clearing instances with prefix launch-advisor-20251123
|
||||
当前地域无实例或无匹配实例,无需清理
|
||||
[main] clearing completed
|
||||
[创建] 正在提交创建实例请求
|
||||
InvalidAccountStatus.NotEnoughBalance
|
||||
code: 403, Your account does not have enough balance to order postpaid product. request id: 60394E0C-C0A4-5D7D-8380-49A7D5EC37A5
|
||||
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=60394E0C-C0A4-5D7D-8380-49A7D5EC37A5
|
||||
未获得实例ID,终止
|
||||
|
||||
[定时] 开始执行 pipeline:2026-03-23T12:30:00.003733
|
||||
[main] start clearing instances with prefix launch-advisor-20251123
|
||||
当前地域无实例或无匹配实例,无需清理
|
||||
[main] clearing completed
|
||||
[创建] 正在提交创建实例请求
|
||||
InvalidAccountStatus.NotEnoughBalance
|
||||
code: 403, Your account does not have enough balance to order postpaid product. request id: 79E1B24A-4D5A-533D-97CB-71616BB5336B
|
||||
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=79E1B24A-4D5A-533D-97CB-71616BB5336B
|
||||
未获得实例ID,终止
|
||||
|
||||
[定时] 开始执行 pipeline:2026-03-23T18:30:00.002226
|
||||
[main] start clearing instances with prefix launch-advisor-20251123
|
||||
当前地域无实例或无匹配实例,无需清理
|
||||
[main] clearing completed
|
||||
[创建] 正在提交创建实例请求
|
||||
InvalidAccountStatus.NotEnoughBalance
|
||||
code: 403, Your account does not have enough balance to order postpaid product. request id: 309194E6-612F-5921-96A1-C6620E0BE99B
|
||||
https://api.aliyun.com/troubleshoot?q=InvalidAccountStatus.NotEnoughBalance&product=Ecs&requestId=309194E6-612F-5921-96A1-C6620E0BE99B
|
||||
未获得实例ID,终止
|
||||
|
||||
[定时] 开始执行 pipeline:2026-03-24T00:30:00.004105
|
||||
|
||||
55
spider-entrypoint.sh
Normal file
55
spider-entrypoint.sh
Normal file
@ -0,0 +1,55 @@
|
||||
#!/bin/bash
# Entrypoint of the spider container: resolves defaults and prints the
# effective configuration before any spider process is started.
set -e

# Assign the defaults only when the variable is unset or empty.
: "${PLATFORM:=boss}"
: "${INSTANCES:=1}"

# Start-up banner showing the settings this container runs with.
cat <<EOF
==========================================
 Spider Container
 Platform: ${PLATFORM}
 Instances: ${INSTANCES}
 API: ${API_BASE_URL}
 Delay: ${SLEEP_MIN_SECONDS}-${SLEEP_MAX_SECONDS}s
 Inline Co: ${INLINE_COMPANY}
==========================================
EOF
|
||||
|
||||
# Map the requested platform to the Python module that runs its spider.
if [[ "${PLATFORM}" == "boss" ]]; then
    MODULE="spiderJobs.platforms.boss.main"
elif [[ "${PLATFORM}" == "qcwy" ]]; then
    MODULE="spiderJobs.platforms.job51.main"
elif [[ "${PLATFORM}" == "zhilian" ]]; then
    MODULE="spiderJobs.platforms.zhilian.main"
else
    # Anything else is a configuration error: report it and stop.
    echo "[ERROR] Unknown PLATFORM: ${PLATFORM}"
    echo " Supported: boss, qcwy, zhilian"
    exit 1
fi
|
||||
|
||||
# Start INSTANCES spider processes. Every instance runs in the background and
# its PID is recorded in PIDS so the supervisor code below can stop them.

# Reject 0, negative, non-numeric and zero-padded values up front; otherwise
# the arithmetic below either fails with a cryptic error or silently starts a
# single instance labelled "0/0".
if ! [[ "${INSTANCES}" =~ ^[1-9][0-9]*$ ]]; then
    echo "[ERROR] INSTANCES must be a positive integer, got: ${INSTANCES}"
    exit 1
fi

PIDS=()
for ((i = 1; i <= INSTANCES; i++)); do
    echo "[spider] Starting ${PLATFORM} instance ${i}/${INSTANCES} (background)..."
    python -m "${MODULE}" &
    PIDS+=("$!")
    # Stagger start-up so instances do not grab the same keyword at the same
    # time; no delay is needed after the last one.
    if ((i < INSTANCES)); then
        sleep 2
    fi
done
|
||||
|
||||
# Supervise the spider processes: as soon as any one of them exits, or the
# container receives SIGTERM/SIGINT, stop all of them and exit.

# Send SIGTERM to every recorded spider PID and block until all children of
# this shell have terminated.
wait_and_cleanup() {
    echo "[spider] Shutting down all instances..."
    for pid in "${PIDS[@]}"; do
        # The process may already be gone; that is not an error.
        kill "$pid" 2>/dev/null || true
    done
    wait
    echo "[spider] All instances stopped."
}

# Signal path: clean up, then exit with the conventional 128+signal status.
# $1 is the exit status to use.
on_signal() {
    wait_and_cleanup
    exit "$1"
}

trap 'on_signal 143' SIGTERM
trap 'on_signal 130' SIGINT

# Wait for the first child to exit. Its status is captured explicitly because
# the script runs under `set -e`: a bare `wait -n` returning non-zero would
# abort the script before the remaining instances are shut down.
EXIT_CODE=0
wait -n || EXIT_CODE=$?
wait_and_cleanup
exit "${EXIT_CODE}"
|
||||
41
spider.Dockerfile
Normal file
41
spider.Dockerfile
Normal file
@ -0,0 +1,41 @@
|
||||
FROM python:3.11-slim-bullseye

WORKDIR /opt/spider

# Time zone (Asia/Shanghai) and basic tooling.
RUN ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
    && echo "Asia/Shanghai" > /etc/timezone \
    && apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Python dependencies (the dependency file is copied first to make use of the
# Docker layer cache).
COPY crawler_core/pyproject.toml /opt/spider/crawler_core/pyproject.toml
# NOTE: a specifier containing `>` must be quoted. Unquoted, the shell parses
# `tenacity>=8.0` as the argument `tenacity` plus a redirection of pip's output
# into a file named "=8.0", so the version constraint is silently dropped.
RUN pip install --no-cache-dir \
        requests_go==1.0.9 \
        "tenacity>=8.0" \
        requests \
        PySocks \
        six \
        -i https://pypi.tuna.tsinghua.edu.cn/simple

# Application code.
COPY crawler_core/ /opt/spider/crawler_core/
COPY spiderJobs/ /opt/spider/spiderJobs/

# Default environment variables.
ENV API_BASE_URL=http://124.222.106.226:9999 \
    API_TOKEN=dev \
    PLATFORM=boss \
    SLEEP_MIN_SECONDS=5 \
    SLEEP_MAX_SECONDS=12 \
    MAX_PAGES=100 \
    INLINE_COMPANY=0 \
    PYTHONUNBUFFERED=1
# The proxy default is built into the code (spiderJobs/__init__.py).
# Set PROXY_TUNNEL=none to disable the proxy.

COPY spider-entrypoint.sh /opt/spider/entrypoint.sh
RUN chmod +x /opt/spider/entrypoint.sh

ENTRYPOINT ["/opt/spider/entrypoint.sh"]
|
||||
@ -0,0 +1,3 @@
|
||||
# 默认隧道代理(快代理),每次请求自动换 IP
|
||||
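# The PROXY_TUNNEL environment variable overrides this value; set it to "none" to disable the proxy.
# SECURITY(review): the URL below embeds a proxy username and password in source control.
# Rotate these credentials and supply them through PROXY_TUNNEL or a secret store instead.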
|
||||
DEFAULT_TUNNEL_PROXY = "http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818"
|
||||
55
spiderJobs/aaa.txt
Normal file
55
spiderJobs/aaa.txt
Normal file
@ -0,0 +1,55 @@
|
||||
容器运行正常,成功获取到关键词并开始爬取。
|
||||
|
||||
创建完成,3 个文件
|
||||
|
||||
┌───────────────────────────┬─────────────────────────────────┐
|
||||
│ 文件 │ 用途 │
|
||||
├───────────────────────────┼─────────────────────────────────┤
|
||||
│ spider.Dockerfile │ 爬虫专用镜像,轻量无前端/nginx │
|
||||
├───────────────────────────┼─────────────────────────────────┤
|
||||
│ spider-entrypoint.sh │ 支持单容器多进程(INSTANCES=3) │
|
||||
├───────────────────────────┼─────────────────────────────────┤
|
||||
│ docker-compose.spider.yml │ 一键启动三个平台 │
|
||||
└───────────────────────────┴─────────────────────────────────┘
|
||||
|
||||
使用方式
|
||||
|
||||
单机快速启动(3 个平台各 3 个实例 = 9 个爬虫):
|
||||
docker compose -f docker-compose.spider.yml up -d
|
||||
|
||||
多台电脑部署:
|
||||
# 1. 构建并推送到你的镜像仓库
|
||||
docker build -f spider.Dockerfile -t your-registry/jobdata-spider:latest .
|
||||
docker push your-registry/jobdata-spider:latest
|
||||
|
||||
# 2. 每台电脑上拉取运行
|
||||
docker run -d --name spider-boss \
|
||||
-e PLATFORM=boss \
|
||||
-e INSTANCES=3 \
|
||||
-e API_BASE_URL=http://124.222.106.226:9999 \
|
||||
-e SLEEP_MIN_SECONDS=5 \
|
||||
-e SLEEP_MAX_SECONDS=12 \
|
||||
-e INLINE_COMPANY=0 \
|
||||
your-registry/jobdata-spider:latest
|
||||
|
||||
调参:
|
||||
# 加速:降低延迟,多开实例
|
||||
-e SLEEP_MIN_SECONDS=3 -e SLEEP_MAX_SECONDS=8 -e INSTANCES=5
|
||||
|
||||
# 需要代理
|
||||
-e PROXY_TUNNEL=http://user:pass@proxy.example.com:8080   # 填写完整代理 URL(含协议与账号密码);代码已内置默认代理,设为 none 可禁用
|
||||
|
||||
# Boss 需要 Token(会自动从后端 API 获取,也可手动指定)
|
||||
-e BOSS_MPT=xxx -e BOSS_WT2=xxx
|
||||
|
||||
默认配置已优化:延迟 5-12 秒(比之前 10-20
|
||||
快一倍),INLINE_COMPANY=0(关掉内联公司爬取)。3 台电脑每台 INSTANCES=3,Boss
|
||||
吞吐量可以从现在的 5 个/小时 → ~270 个/小时。
|
||||
|
||||
✻ Cooked for 2m 16s
|
||||
|
||||
───────────────────────────────────────────────────────────────────────────────────────
|
||||
❯
|
||||
───────────────────────────────────────────────────────────────────────────────────────
|
||||
Opus 4.6 (1M context) │ JobData █░░░░░░░░░ 14%
|
||||
⏵⏵ accept edits on (shift+tab to cycle)
|
||||
@ -141,15 +141,11 @@ def main():
|
||||
signer = BossSign(mpt=mpt, wt2=wt2)
|
||||
client_kwargs["signer"] = signer
|
||||
|
||||
tunnel = os.environ.get("PROXY_TUNNEL", "")
|
||||
if tunnel:
|
||||
scheme = os.environ.get("PROXY_SCHEME", "http")
|
||||
username = os.environ.get("PROXY_USERNAME", "")
|
||||
password = os.environ.get("PROXY_PASSWORD", "")
|
||||
if username and password:
|
||||
client_kwargs["tunnel_proxy"] = f"{scheme}://{username}:{password}@{tunnel}"
|
||||
else:
|
||||
client_kwargs["tunnel_proxy"] = f"{scheme}://{tunnel}"
|
||||
from spiderJobs import DEFAULT_TUNNEL_PROXY
|
||||
tunnel_proxy = os.environ.get("PROXY_TUNNEL", DEFAULT_TUNNEL_PROXY)
|
||||
if tunnel_proxy and tunnel_proxy.lower() != "none":
|
||||
client_kwargs["tunnel_proxy"] = tunnel_proxy
|
||||
print(f"[boss] 隧道代理: {tunnel_proxy.split('@')[-1] if '@' in tunnel_proxy else tunnel_proxy}")
|
||||
|
||||
run_crawl_loop(
|
||||
platform="boss",
|
||||
|
||||
@ -90,15 +90,11 @@ def create_company_fetcher(company_id: str, http_client: Job51Client) -> BaseFet
|
||||
def main():
|
||||
client_kwargs = {}
|
||||
|
||||
tunnel = os.environ.get("PROXY_TUNNEL", "")
|
||||
if tunnel:
|
||||
scheme = os.environ.get("PROXY_SCHEME", "http")
|
||||
username = os.environ.get("PROXY_USERNAME", "")
|
||||
password = os.environ.get("PROXY_PASSWORD", "")
|
||||
if username and password:
|
||||
client_kwargs["tunnel_proxy"] = f"{scheme}://{username}:{password}@{tunnel}"
|
||||
else:
|
||||
client_kwargs["tunnel_proxy"] = f"{scheme}://{tunnel}"
|
||||
from spiderJobs import DEFAULT_TUNNEL_PROXY
|
||||
tunnel_proxy = os.environ.get("PROXY_TUNNEL", DEFAULT_TUNNEL_PROXY)
|
||||
if tunnel_proxy and tunnel_proxy.lower() != "none":
|
||||
client_kwargs["tunnel_proxy"] = tunnel_proxy
|
||||
print(f"[qcwy] 隧道代理: {tunnel_proxy.split('@')[-1] if '@' in tunnel_proxy else tunnel_proxy}")
|
||||
|
||||
run_crawl_loop(
|
||||
platform="qcwy",
|
||||
|
||||
@ -51,6 +51,7 @@ class ZhilianClient(HTTPClient):
|
||||
self,
|
||||
base_url: str = CGATE_BASE_URL,
|
||||
signer: Optional[ZhilianSign] = None,
|
||||
tunnel_proxy: Optional[str] = None,
|
||||
proxy: Optional[str] = None,
|
||||
proxy_pool: Optional[list[str]] = None,
|
||||
timeout: int = 10,
|
||||
@ -58,6 +59,7 @@ class ZhilianClient(HTTPClient):
|
||||
super().__init__(
|
||||
base_url=base_url,
|
||||
default_headers=ZHILIAN_HEADERS,
|
||||
tunnel_proxy=tunnel_proxy,
|
||||
proxy=proxy,
|
||||
proxy_pool=proxy_pool,
|
||||
timeout=timeout,
|
||||
@ -81,18 +83,20 @@ class ZhilianClient(HTTPClient):
|
||||
|
||||
def create_cgate_client(
    signer: Optional[ZhilianSign] = None,
    tunnel_proxy: Optional[str] = None,
    proxy: Optional[str] = None,
    proxy_pool: Optional[list[str]] = None,
) -> ZhilianClient:
    """Create a client for the cgate endpoint (``CGATE_BASE_URL``).

    Args:
        signer: Optional request signer handed to the client.
        tunnel_proxy: Optional tunnel proxy setting handed to the client.
        proxy: Optional single proxy setting handed to the client.
        proxy_pool: Optional list of proxy settings handed to the client.

    Returns:
        A ``ZhilianClient`` built with ``CGATE_BASE_URL`` and the given options.
    """
    # Forward tunnel_proxy as well; the earlier return statement dropped it,
    # which left the forwarding return unreachable.
    return ZhilianClient(
        base_url=CGATE_BASE_URL,
        signer=signer,
        tunnel_proxy=tunnel_proxy,
        proxy=proxy,
        proxy_pool=proxy_pool,
    )
|
||||
|
||||
|
||||
def create_capi_client(
    signer: Optional[ZhilianSign] = None,
    tunnel_proxy: Optional[str] = None,
    proxy: Optional[str] = None,
    proxy_pool: Optional[list[str]] = None,
) -> ZhilianClient:
    """Create a client for the capi endpoint (``CAPI_BASE_URL``).

    Args:
        signer: Optional request signer handed to the client.
        tunnel_proxy: Optional tunnel proxy setting handed to the client.
        proxy: Optional single proxy setting handed to the client.
        proxy_pool: Optional list of proxy settings handed to the client.

    Returns:
        A ``ZhilianClient`` built with ``CAPI_BASE_URL`` and the given options.
    """
    # Forward tunnel_proxy as well; the earlier return statement dropped it,
    # which left the forwarding return unreachable.
    return ZhilianClient(
        base_url=CAPI_BASE_URL,
        signer=signer,
        tunnel_proxy=tunnel_proxy,
        proxy=proxy,
        proxy_pool=proxy_pool,
    )
|
||||
|
||||
|
||||
@ -91,9 +91,12 @@ def create_company_fetcher(company_id: str, http_client: ZhilianClient) -> BaseF
|
||||
|
||||
def main():
|
||||
client_kwargs = {}
|
||||
proxy = os.environ.get("PROXY_URL", "")
|
||||
if proxy:
|
||||
client_kwargs["proxy"] = proxy
|
||||
|
||||
from spiderJobs import DEFAULT_TUNNEL_PROXY
|
||||
tunnel_proxy = os.environ.get("PROXY_TUNNEL", DEFAULT_TUNNEL_PROXY)
|
||||
if tunnel_proxy and tunnel_proxy.lower() != "none":
|
||||
client_kwargs["tunnel_proxy"] = tunnel_proxy
|
||||
print(f"[zhilian] 隧道代理: {tunnel_proxy.split('@')[-1] if '@' in tunnel_proxy else tunnel_proxy}")
|
||||
|
||||
run_crawl_loop(
|
||||
platform="zhilian",
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user