This commit is contained in:
zfc 2026-01-24 17:07:34 +08:00
parent 7285475eb5
commit 3d7e96845d
21 changed files with 43687 additions and 1 deletions

View File

@ -63,6 +63,8 @@ uvicorn = "==0.34.0"
uvloop = "==0.21.0"
watchfiles = "==1.0.4"
websockets = "==14.1"
PyExecJS = "==1.5.1"
playwright = "==1.57.0"
asyncpg = "*"
pandas = "*"
openpyxl = "*"

View File

@ -17,6 +17,7 @@ from .pipeline import pipeline_router
from .keyword import keyword_router
from .cleaning import cleaning_router
from .analytics import router as analytics_router
from .company import company_router
v1_router = APIRouter()
@ -37,3 +38,4 @@ v1_router.include_router(pipeline_router, prefix="/pipeline")
v1_router.include_router(keyword_router, prefix="/keyword")
v1_router.include_router(cleaning_router, prefix="/cleaning", dependencies=[DependPermission])
v1_router.include_router(analytics_router, prefix="/analytics", tags=["数据分析"])
v1_router.include_router(company_router, prefix="/company", tags=["公司搜索"])

View File

@ -0,0 +1,3 @@
from .company import router as company_router
__all__ = ["company_router"]

View File

@ -0,0 +1,47 @@
from typing import Any, Dict, Optional
from fastapi import APIRouter, Depends
from pydantic import BaseModel, Field
from app.controllers.company import CompanyController, create_company_controller
from app.schemas.base import Fail, Success
router = APIRouter(tags=["公司搜索"])
class ZhilianSearchRequest(BaseModel):
    """Request body for the Zhilian Zhaopin company-search endpoint."""

    # Company-name keyword to search for (required field).
    keyword: str = Field(..., description="公司名称关键词")
    # Optional city name; defaults to None when omitted.
    city: Optional[str] = Field(None, description="城市名称,如北京")
class QcwySearchRequest(BaseModel):
    """Request body for the 51job (qcwy) company-search endpoint."""

    # Company-name keyword to search for (required field).
    keyword: str = Field(..., description="公司名称关键词")
async def get_company_controller() -> CompanyController:
    """FastAPI dependency: build a ``CompanyController`` through the factory."""
    controller = create_company_controller()
    return controller
@router.post("/zhilian/search", summary="智联招聘公司搜索")
async def zhilian_search_company(
    request: ZhilianSearchRequest,
    controller: CompanyController = Depends(get_company_controller),
) -> Dict[str, Any]:
    """Search Zhilian Zhaopin for a company by keyword and optional city.

    Returns a ``Success`` wrapping the controller's result. Any exception
    raised while searching (or while building the success response) is
    converted into a ``Fail`` whose message embeds the exception text.
    """
    try:
        found = await controller.search_zhilian_company(request.keyword, request.city)
        outcome = Success(data=found)
    except Exception as err:
        outcome = Fail(msg=f"智联公司搜索失败: {err}")
    return outcome
@router.post("/qcwy/search", summary="前程无忧公司搜索")
async def qcwy_search_company(
    request: QcwySearchRequest,
    controller: CompanyController = Depends(get_company_controller),
) -> Dict[str, Any]:
    """Search 51job (qcwy) for a company by keyword.

    Returns a ``Success`` wrapping the controller's result. Any exception
    raised while searching (or while building the success response) is
    converted into a ``Fail`` whose message embeds the exception text.
    """
    try:
        found = await controller.search_qcwy_company(request.keyword)
        outcome = Success(data=found)
    except Exception as err:
        outcome = Fail(msg=f"前程无忧公司搜索失败: {err}")
    return outcome

View File

@ -0,0 +1,20 @@
import asyncio
from typing import Any, Dict, List, Optional
from company_spider.qcwy_company.spider import search_company as qcwy_search_company
from company_spider.zhilianzhaopin_company.spider import crawl_companies
class CompanyController:
    """Async facade over the blocking company spiders.

    Both spiders are synchronous, so each call is pushed to a worker thread
    with ``asyncio.to_thread`` to keep the event loop responsive.
    """

    async def search_qcwy_company(self, keyword: str) -> Optional[Dict[str, Any]]:
        """Look up a company on 51job (qcwy).

        Args:
            keyword: Company name to search for.

        Returns:
            Whatever the qcwy spider's ``search_company`` returns
            (annotated as an optional dict).
        """
        return await asyncio.to_thread(qcwy_search_company, keyword)

    async def search_zhilian_company(
        self,
        keyword: str,
        city: Optional[str] = None,
        max_companies: int = 10,
    ) -> List[Dict[str, Any]]:
        """Look up companies on Zhilian Zhaopin.

        Args:
            keyword: Company name to search for; sent to the spider as ``kw``.
            city: Optional city name; sent as ``city`` only when non-empty.
            max_companies: Upper bound forwarded to ``crawl_companies``
                (previously hard-coded to 10, which remains the default).

        Returns:
            The list produced by ``crawl_companies``.
        """
        params: Dict[str, Any] = {"kw": keyword}
        if city:
            params["city"] = city
        # NOTE(review): crawl_companies is called with its default
        # headless=False, i.e. a headed browser. Confirm the deployment
        # host provides a display (or pass headless=True here).
        return await asyncio.to_thread(crawl_companies, params, max_companies)
def create_company_controller() -> CompanyController:
    """Factory returning a fresh ``CompanyController`` instance."""
    instance = CompanyController()
    return instance

46
company_spider/Dockerfile Normal file
View File

@ -0,0 +1,46 @@
# Image for the company spider service: Python 3.11 plus Node.js (for PyExecJS)
# and a Playwright-managed Chromium.
FROM python:3.11-slim
# Point apt-get at the Aliyun mirror. The first sed targets the deb822-style
# sources file, the second the classic sources.list; "|| true" keeps the build
# going if neither file exists.
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources || \
    sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list || true
# Install system dependencies
# Node.js is required for PyExecJS
RUN apt-get update && apt-get install -y \
    nodejs \
    npm \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*
# Configure npm to use the npmmirror registry
RUN npm config set registry https://registry.npmmirror.com
WORKDIR /app
# Configure pip to use a domestic (Tsinghua) mirror by writing a config file
RUN mkdir -p /root/.pip && \
    echo '[global]' > /root/.pip/pip.conf && \
    echo 'index-url = https://pypi.tuna.tsinghua.edu.cn/simple' >> /root/.pip/pip.conf && \
    echo 'trusted-host = pypi.tuna.tsinghua.edu.cn' >> /root/.pip/pip.conf && \
    echo 'timeout = 120' >> /root/.pip/pip.conf
# Copy requirements first to leverage cache
COPY requirements.txt .
# Install the Python dependencies through the mirror configured above
RUN pip install --no-cache-dir -r requirements.txt
# Install Playwright browsers and system dependencies
# We only need chromium for this project
# Point Playwright's browser download at a domestic mirror
ENV PLAYWRIGHT_DOWNLOAD_HOST=https://npmmirror.com/mirrors/playwright
RUN playwright install chromium
RUN playwright install-deps chromium
COPY . .
# Expose the port
EXPOSE 8000
# Run the application
# NOTE(review): assumes a main.py exposing `app` is present in the build context — confirm.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

View File

12
company_spider/demo Normal file
View File

@ -0,0 +1,12 @@
# Example: 51job (qcwy) company search; POSTs a JSON body with the company keyword.
curl --location 'http://127.0.0.1:9999/api/v1/company/qcwy/search' \
--header 'Content-Type: application/json' \
--data '{
"keyword": "中信期货有限公司"
}'
# Example: Zhilian Zhaopin company search; same keyword plus an optional city.
curl --location 'http://127.0.0.1:9999/api/v1/company/zhilian/search' \
--header 'Content-Type: application/json' \
--data '{
"keyword": "中信期货有限公司",
"city":"北京"
}'

View File

@ -0,0 +1,65 @@
// Minimal stand-in for the browser global that the challenge script inspects.
// Declared with `var` so the script does not rely on implicit-global creation.
var window = {};
// Placeholders kept from the original challenge script; unused below.
var arg3 = null;
var arg4 = null;
var arg5 = null;
var arg6 = null;
var arg7 = null;
var arg8 = null;
var arg9 = null;
var arg10 = null;

/**
 * Derive the `acw_sc__v2` cookie value from the `arg1` challenge string.
 *
 * Steps: (1) reorder the characters of `arg1` with a fixed permutation,
 * (2) XOR the result, byte by byte as hex pairs, with a fixed hex key.
 *
 * @param {string} arg1 Challenge string taken from the anti-bot page.
 * @returns {string} Lower-case hex string (zero-padded pairs).
 */
var l = function (arg1) {
    // Trap kept from the original script: loops forever when PhantomJS
    // markers exist on `window`. With the empty stub above it never runs.
    while (window["_phantom"] || window["__phantomas"]) {
    }

    // Hex-encoded XOR key.
    var xorKey = "3000176000856006061501533003690027800375";
    // Permutation table: output position j receives input character order[j] (1-based).
    var order = [15, 35, 29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11, 39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36];

    // Reorder the characters of `text` according to `order`.
    // Local helper instead of patching String.prototype on every call.
    function unsbox(text) {
        var shuffled = [];
        for (var i = 0; i < text["length"]; i++) {
            var ch = text[i];
            for (var j = 0; j < order["length"]; j++) {
                if (order[j] == i + 1) {
                    shuffled[j] = ch;
                }
            }
        }
        return shuffled["join"]("");
    }

    // XOR two hex strings pair by pair, stopping at the shorter one.
    function hexXor(left, right) {
        var out = "";
        for (var pos = 0; pos < left["length"] && pos < right["length"]; pos += 2) {
            var a = parseInt(left["slice"](pos, pos + 2), 16);
            var b = parseInt(right["slice"](pos, pos + 2), 16);
            var pair = (a ^ b)["toString"](16);
            if (pair["length"] == 1) {
                pair = "0" + pair;
            }
            out += pair;
        }
        return out;
    }

    // Local variable (was an implicit global); the per-call debug
    // console.log that wrote to the runtime's stdout has been removed.
    var arg2 = hexXor(unsbox(arg1), xorKey);
    // setTimeout("reload(arg2)", 2);
    return arg2;
};
// var arg1 = "FAA6CB46CF724D58FF82E5310687947623413114";
// l(arg1)

View File

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,91 @@
import crypto from 'crypto';
// 使用 crypto
// Return the hex-encoded HMAC-SHA256 digest of `message` keyed with `key`.
const hmacSHA256 = (message, key) => {
    const mac = crypto.createHmac('sha256', key);
    mac.update(message);
    return mac.digest('hex');
};
// Sign a request config: HMAC-SHA256 over `url` followed by `data`
// (an empty string when `data` is falsy), using the fixed key below.
// Logs the url and data before signing.
function A(t) {
    console.log(t.url);
    console.log(t.data);
    var payload = t.data ? t.data : "";
    var message = t.url + payload;
    var secret = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b";
    return hmacSHA256(message, secret);
}
// Object-merge helper: copies the properties of each extra argument onto `e`
// and returns `e`. Odd-positioned arguments are copied key by key; even ones
// via property descriptors.
// NOTE(review): it references `r` and `i`, neither of which is defined in the
// visible script, and nothing visible calls it. Presumably lifted from a
// bundled build; calling it as-is would throw a ReferenceError. Confirm
// whether it can be deleted.
function a(e) {
    for (var t = 1; t < arguments.length; t++) {
        var n = null != arguments[t] ? arguments[t] : {};
        t % 2 ? r(Object(n), !0).forEach((function (t) {
                Object(i["a"])(e, t, n[t])
            }
        )) : Object.getOwnPropertyDescriptors ? Object.defineProperties(e, Object.getOwnPropertyDescriptors(n)) : r(Object(n)).forEach((function (t) {
                Object.defineProperty(e, t, Object.getOwnPropertyDescriptor(n, t))
            }
        ))
    }
    return e
}
// Sample request config used as input for A() below. Only `url` (and `data`,
// absent here) are read by A(); the remaining fields are not used by the
// visible code.
var t = {
    "transitional": {"silentJSONParsing": true, "forcedJSONParsing": true, "clarifyTimeoutError": false},
    "transformRequest": [null],
    "transformResponse": [null],
    "timeout": 30000,
    "xsrfCookieName": "XSRF-TOKEN",
    "xsrfHeaderName": "X-XSRF-TOKEN",
    "maxContentLength": -1,
    "maxBodyLength": -1,
    "headers": {
        "common": {"Accept": "application/json, text/plain, */*"},
        "delete": {},
        "get": {"Content-Type": "application/x-www-form-urlencoded"},
        "head": {},
        "post": {"Content-Type": "application/json"},
        "put": {"Content-Type": "application/x-www-form-urlencoded"},
        "patch": {"Content-Type": "application/x-www-form-urlencoded"}
    },
    "baseURL": "https://we.51job.com",
    "withCredentials": true,
    // Note that the signed url is already percent-encoded (e.g. pageCode=sou%7Csou%7Csoulb).
    "url": "/api/job/search-pc?api_key=51job&timestamp=1769136341&keyword=%E5%8D%8E%E4%B8%BA%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&searchType=2&function=&industry=&jobArea=010000&jobArea2=&landmark=&metro=&salary=&workYear=&degree=&companyType=&companySize=&jobType=&issueDate=&sortType=0&pageNum=1&requestId=&keywordType=&pageSize=20&source=1&accountId=&pageCode=sou%7Csou%7Csoulb&scene=7",
    "method": "get",
    "property": {"keywordType": ""}
};
// Sample object with tracking-style fields; not referenced by the visible code.
var b = {
    "partner": "cn_bing_com",
    "webId": 2,
    "fromdomain": "51job_web",
    "frompageUrl": "https://we.51job.com/",
    "pageUrl": "https://we.51job.com/pc/search?jobArea=010000&keyword=%E5%8D%8E%E4%B8%BA%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&searchType=2&keywordType=",
    "identityType": "",
    "userType": "",
    "isLogin": "否",
    "accountid": ""
}
// Print the signature computed for the sample config.
console.log(A(t));
// function wordsToHex(words) {
// // CryptoJS 使用 32 位有符号整数存储,需要处理
// let hex = '';
// for (let i = 0; i < words.length; i++) {
// // 将负数转换为无符号整数
// const word = words[i] >>> 0;
// // 转换为十六进制并补零
// hex += word.toString(16).padStart(8, '0');
// }
// return hex;
// }
//
// // 你的数据
// const words = [-762966511, 1702028048, 94455509, -201850815,
// 300412866, 1405396681, 85275542, 246713406];
//
// console.log(wordsToHex(words))
/*
* sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2219be8d71f8213f1-0fd9b910813aa58-4c657b58-3686400-19be8d71f831716%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTliZThkNzFmODIxM2YxLTBmZDliOTEwODEzYWE1OC00YzY1N2I1OC0zNjg2NDAwLTE5YmU4ZDcxZjgzMTcxNiJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%2219be8d71f8213f1-0fd9b910813aa58-4c657b58-3686400-19be8d71f831716%22%7D; ssxmod_itna=1-Gqmx0DuD2Dc0D=Ni73itD2Dp=DmpsKeQDzxCH9P0CCDLxn4xGdY2=Cw3DyD7Tp4RG5DaQi2Yea0xGXKwDA5DnCx7YDt=RcwxK06dvxK=W0mitswe6uDuYLP2GGRRgW_GlDMEHLA6C5N7qxDHwd4KxGLDY=DCqxq57eD4f3Dt4DIDAYDDxDWDYEPxGUQDG=D7rTi5pWtxi3DboaDmd2WC=FD03q=EWFoDDtAbeG2bETqDDNqF9G3_lh3_PD_bW9QKtWemFxPneDMbxGX7YCqnlH2oyDWpFkUsao3xB=gxBQbyPnhwETadZanDY4lGrWYY2DIjGxWxiGG1i05Q03nwsWmwlG1Gv_GDxhw4SrUDDAt_hWDHBRqW0tK2lj5/bc_9yYtAbYW=LgrRxxWqqRAOIotBhNi47fD5=4qf0esGthu5oiPeD; ssxmod_itna2=1-Gqmx0DuD2Dc0D=Ni73itD2Dp=DmpsKeQDzxCH9P0CCDLxn4xGdY2=Cw3DyD7Tp4RG5DaQi2YeaKxDfrQfQGh4qBFjq03_jSefWDlO03BqKSSfAaeFuhD2y0F5nKj4LMzWF2qLViLAjiLzGteYAj1KAULY4hzS3=uiLiHAktq7AQK04=RCrN4_lNnDaNPYDr4nhTEjfu/3d5Fcwil7pUxfDu7yjj5TT0UnkUbM4F0FALQk19oO64i1g2QsibdzqxtPn8oOB3wpj5FVm6R_LF2EKxZIWFfaGt9oNT4U_0IjQx40hUsUKLNOBzuR1Mh=_gTlLdLS53B3OE4dGDB8GdjhOf4MYhuE37oTUMtTCwOOD7WhhjwgohMumFghOCNeDxRqr92NTeIRW=oOeThvw7DBG5/DoFShd7v5ZxwYEKiDD
* */

View File

@ -0,0 +1,327 @@
import hashlib
import hmac
import json
import execjs
import re
import time
import uuid
import requests
from urllib.parse import unquote, quote
from typing import Optional, Dict
import os
class SignGenerator:
    """Produce the request signatures and anti-bot cookie used for 51job calls.

    All signatures are hex HMAC-SHA256 digests keyed with ``secret_key``.
    """

    def __init__(self):
        # Signing key obtained from the site's JavaScript.
        self.secret_key = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b"
        self.secret_key_bytes = self.secret_key.encode('utf-8')

    def hmac_sha256(self, message, key):
        """Return the hex HMAC-SHA256 digest of *message* under *key*.

        Both arguments may be ``str`` (encoded as UTF-8) or ``bytes``.
        """
        key_bytes = key.encode('utf-8') if isinstance(key, str) else key
        message_bytes = message.encode('utf-8') if isinstance(message, str) else message
        return hmac.new(key_bytes, message_bytes, hashlib.sha256).hexdigest()

    def generate_signature(self, t):
        """Sign a request-config mapping (counterpart of JS function ``A``).

        JS logic: ``signature = hmacSHA256(url + (data || ""), secret_key)``.

        Args:
            t: Mapping with a ``"url"`` entry and an optional ``"data"``
                entry (string or dict).

        Returns:
            Hex signature string.
        """
        url = t.get("url", "")
        # Mirror the JS ``data || ""``: a missing, None or empty value signs
        # as the empty string instead of raising on ``url + None``.
        data = t.get("data") or ""
        return self.generate_signature_from_components(url, data)

    def generate_signature_from_components(self, url, data=None):
        """Sign *url* followed by *data*.

        Args:
            url: URL string to sign.
            data: Optional payload; ``None`` signs as the empty string and a
                dict is serialised as compact JSON (non-ASCII kept as-is).

        Returns:
            Hex signature string.
        """
        if data is None:
            data = ""
        elif isinstance(data, dict):
            data = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
        return self.hmac_sha256(url + data, self.secret_key)

    def generate_acw_sc__v2(self, arg1):
        """Compute the ``acw_sc__v2`` cookie by running ``l(arg1)`` from 04.js.

        The script is read from the directory containing this file and
        executed through ``execjs``.

        Returns:
            The computed value, or ``None`` when the script returns a falsy value.
        """
        # Resolve 04.js relative to this file rather than the working directory.
        current_dir = os.path.dirname(os.path.abspath(__file__))
        js_file_path = os.path.join(current_dir, '04.js')
        with open(js_file_path, 'r', encoding='utf-8') as f:
            js = f.read()
        acw_sc__v2 = execjs.compile(js).call('l', arg1)
        return acw_sc__v2 or None

    def generate_company_detail(self, cid: str) -> dict:
        """Sign a company-detail request for the encrypted company id *cid*.

        Returns:
            ``{"signature": <hex digest>, "timestamp": <int epoch seconds>}``;
            the timestamp is the one embedded in the signed string.
        """
        timestamp = int(time.time())
        # String to sign: the request path and query exactly as sent.
        message = f"/open/noauth/company-info/pc-info?api_key=51job&timestamp={timestamp}&encryCompanyId={cid}"
        # Reuse the shared key and helper instead of a second copy of the secret.
        signature = self.hmac_sha256(message, self.secret_key)
        return {"signature": signature, "timestamp": timestamp}
def _find_exact_company(data, keyword: str) -> Optional[Dict]:
    """Return the first search hit whose full company name equals *keyword*, else None."""
    if data.get('status') != '1' or 'resultbody' not in data:
        return None
    resultbody = data['resultbody']
    if 'job' not in resultbody or 'items' not in resultbody['job']:
        return None
    target = keyword.strip()
    for item in resultbody['job']['items']:
        # ``or ''`` also covers an explicit null value, which
        # ``.get(key, '')`` alone would hand back as None and crash ``.strip()``.
        full_company_name = (item.get('fullCompanyName') or '').strip()
        if full_company_name == target:
            return {
                'fullCompanyName': full_company_name,
                'companyName': (item.get('companyName') or '').strip(),
                'companyHref': (item.get('companyHref') or '').strip()
            }
    return None


def search_company(keyword: str, job_area: str = "000000") -> Optional[Dict]:
    """Search 51job for a company whose full name exactly matches *keyword*.

    Sends a signed request to the job-search API. When the response contains
    an ``arg1='...'`` challenge, the ``acw_sc__v2`` cookie is computed and the
    request is repeated with it (twice, the second time with any cookies the
    server set).

    Args:
        keyword: Company name to search for.
        job_area: Job-area code; the default "000000" means nationwide.

    Returns:
        A dict with ``fullCompanyName``, ``companyName`` and ``companyHref``
        for the first exact match, or ``None`` when nothing matches, the
        response is not usable, or any error occurs (errors are printed).
    """
    search_url = 'https://we.51job.com/api/job/search-pc'
    signer = SignGenerator()
    timestamp = str(int(time.time()))
    params = {
        'api_key': '51job',
        'timestamp': timestamp,
        'keyword': keyword,
        'searchType': '2',  # 2 = search by company
        'function': '',
        'industry': '',
        'jobArea': job_area,
        'jobArea2': '',
        'landmark': '',
        'metro': '',
        'salary': '',
        'workYear': '',
        'degree': '',
        'companyType': '',
        'companySize': '',
        'jobType': '',
        'issueDate': '',
        'sortType': '0',
        'pageNum': '1',
        'requestId': '',
        'pageSize': '20',
        'source': '1',
        'accountId': '',
        'pageCode': 'sou|sou|soulb',
        'scene': '7'
    }

    # Build the string to sign: only ``keyword`` is URL-encoded, every other
    # value is used verbatim, and empty values become ``key=``.
    # NOTE(review): ``pageCode`` is signed with a raw "|" while the HTTP
    # library percent-encodes it when sending — confirm the server accepts this.
    url_path = '/api/job/search-pc'
    query_parts = []
    for k, v in params.items():
        if not v:
            query_parts.append(f'{k}=')
        elif k == 'keyword':
            query_parts.append(f'{k}={quote(str(v))}')
        else:
            query_parts.append(f'{k}={v}')
    full_url = f"{url_path}?{'&'.join(query_parts)}"
    sign = signer.generate_signature_from_components(full_url)

    encoded_keyword = quote(keyword)
    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'From-Domain': '51job_web',
        'Pragma': 'no-cache',
        'Referer': f'https://we.51job.com/pc/search?keyword={encoded_keyword}&searchType=2&sortType=0&metro=',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        'account-id': '',
        'partner': '',
        'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2Fpc%2Fsearch%3Fkeyword%3D'
                    + encoded_keyword
                    + '%26searchType%3D2%26sortType%3D0%26metro%3D%22%2C%22identityType%22%3A%22%22%2C%22userType%22%3A%22%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountid%22%3A%22%22%7D',
        'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sign': sign,
        'user-token': '',
        'uuid': uuid.uuid4().hex,
    }

    try:
        # The context manager closes the session's connections on every path.
        with requests.Session() as session:

            def fetch(cookies=None):
                # NOTE(review): TLS verification is disabled, as in the
                # original code — confirm this is still required.
                return session.get(
                    search_url,
                    params=params,
                    headers=headers,
                    cookies=cookies,
                    verify=False,
                    timeout=30
                )

            # The first request may come back with an arg1 challenge
            # instead of data.
            response = fetch()
            if 'arg1' in response.text:
                arg1_match = re.findall(r"arg1='(.*?)';", response.text, re.S)
                if arg1_match:
                    acw_sc__v2 = signer.generate_acw_sc__v2(arg1_match[0])
                    if acw_sc__v2:
                        cookies = {
                            'guid': uuid.uuid4().hex,
                            'acw_sc__v2': acw_sc__v2
                        }
                        # Second request carries the computed cookie; fold in
                        # whatever cookies the server sets in reply.
                        response2 = fetch(cookies)
                        cookies.update(response2.cookies.get_dict())
                        # Third request uses the complete cookie set.
                        response = fetch(cookies)

        if response.status_code != 200:
            return None
        try:
            data = response.json()
        except ValueError:
            # Every JSON decode error raised by the HTTP library or the json
            # module is a ValueError subclass.
            print(f"[错误] 响应不是有效的JSON: {response.text[:200]}")
            return None
        return _find_exact_company(data, keyword)
    except Exception as e:
        # Best-effort lookup: report the failure and signal "not found".
        print(f"[错误] 请求失败: {e}")
        import traceback
        print(traceback.format_exc())
        return None
def parse_json_company_desc(uri: str) -> dict:
    """Fetch a company's description and address from the 51job company-info API.

    The encrypted company id is taken from the last path segment of *uri*
    (e.g. ``https://jobs.51job.com/all/coUT9QPQdhBzEGY1A1VjQ.html``), with
    the ``.html`` suffix and a leading ``co`` removed.

    Args:
        uri: Company page URL.

    Returns:
        On success, a dict with ``company_desc``, ``company_location`` and
        ``encryCompanyId``. On failure, a dict with only ``company_desc``
        and ``company_location`` set to a placeholder message.
    """
    sy = uri.split("/")[-1].replace(".html", "")
    # Strip only the leading "co" marker; str.replace would also delete any
    # "co" that happens to occur inside the id itself.
    cid = sy[2:] if sy.startswith("co") else sy

    signer = SignGenerator()
    generate_company_detail_info = signer.generate_company_detail(cid)
    headers = {
        'Host': 'cupid.51job.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'sign': generate_company_detail_info["signature"],
        'uuid': '1e6151f7bc3ce8d7e526c88d7d6592cd',
        'From-Domain': '51job_web',
        'account-id': '',
        'user-token': '',
        'partner': '',
        'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fjobs.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fjobs.51job.com%2Fall%2Fco4194496.html%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountId%22%3A%22%22%2C%22shortPageCode%22%3A%22gsxq%7Czwlb%7Cgsxqlb%22%2C%22pageCode%22%3A%22gsxq%7Czwlb%7Cgsxqlb%22%7D',
        'Origin': 'https://jobs.51job.com',
        'Connection': 'keep-alive',
        'Referer': 'https://jobs.51job.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'TE': 'trailers',
    }
    try:
        # The timestamp in the URL must be the one that was signed.
        desc_url = f"https://cupid.51job.com/open/noauth/company-info/pc-info?api_key=51job&timestamp={generate_company_detail_info['timestamp']}&encryCompanyId={cid}"
        # A timeout keeps a stalled connection from blocking the caller forever
        # (30 s, same as search_company). TLS verification stays disabled as before.
        res = requests.get(url=desc_url, headers=headers, verify=False, timeout=30)
        if not res.ok:
            return {"company_desc": "请求失败", "company_location": "请求失败"}
        company_dinfo = res.json()
        coinfo = company_dinfo["resultbody"]["coinfo"]
        return {"company_desc": coinfo["coinfo"], "company_location": coinfo["caddr"],"encryCompanyId":coinfo["encryCompanyId"]}
    except Exception as e:
        # Covers network errors, invalid JSON and missing keys alike.
        print(f"解析HTML失败: {e}")
        return {"company_desc": "解析失败", "company_location": "解析失败"}
# 使用示例
if __name__ == "__main__":
    # Manual smoke test: search for one company, then fetch a sample detail page.
    keyword = "华为技术有限公司"
    result = search_company(keyword)
    if not result:
        print(f"未找到匹配的公司: {keyword}")
    else:
        print("找到匹配的公司:")
        print(f"  全称: {result['fullCompanyName']}")
        print(f"  简称: {result['companyName']}")
        print(f"  链接: {result['companyHref']}")
    print(parse_json_company_desc("https://jobs.51job.com/all/coA2RXNgBnATgPaQJn.html"))

View File

@ -0,0 +1,92 @@
# -*- coding: UTF-8 -*-
import time
import uuid
import requests
import re
import json
import execjs
# Request headers for the demo call below.
# NOTE: 'sign' and 'uuid' are fixed literals here, not computed per request.
headers = {
    'Accept': 'application/json, text/plain, */*',
    'Accept-Language': 'zh',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'From-Domain': '51job_web',
    'Pragma': 'no-cache',
    'Referer': 'https://we.51job.com/pc/search?keyword=java&searchType=2&sortType=0&metro=',
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
    'account-id': '',
    'partner': '',
    'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2Fpc%2Fsearch%3Fkeyword%3Djava%26searchType%3D2%26sortType%3D0%26metro%3D%22%2C%22identityType%22%3A%22%22%2C%22userType%22%3A%22%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountid%22%3A%22%22%7D',
    'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sign': '839932c059141791d8a003f0e6652e14facbf788a502df374fecf9c107d93b9e',
    'user-token': '',
    'uuid': '1687228791235576552',
}
# Query parameters for the search API; 'timestamp' is a fixed literal as well.
params = {
    'api_key': '51job',
    'timestamp': '1769139097',
    'keyword': '华为技术有限公司',
    'searchType': '2',
    'function': '',
    'industry': '',
    'jobArea': '000000',
    'jobArea2': '',
    'landmark': '',
    'metro': '',
    'salary': '',
    'workYear': '',
    'degree': '',
    'companyType': '',
    'companySize': '',
    'jobType': '',
    'issueDate': '',
    'sortType': '0',
    'pageNum': '1',
    'requestId': '',
    'pageSize': '20',
    'source': '1',
    'accountId': '',
    'pageCode': 'sou|sou|soulb',
    'scene':'7'
}
# Replace with your own proxy, or use none; a single IP is probably rate-limited.
# NOTE: these are placeholders, and `proxies` is not passed to any request in
# the visible code.
proxies = {
    "http":"http://xxx",
    "https":"http://xxxx"
}
# Demo of the three-request flow; range(1,2) yields a single iteration (i == 1).
for i in range(1,2):
    try:
        # cookie = {'guid': 'd02dfbabd84858301947663946e1710f'}
        session = requests.session()
        print("%s次请求:" % i)
        # First request, sent without cookies. The code below expects the
        # body to contain an arg1='...' challenge.
        response = session.get('https://we.51job.com/api/job/search-pc', params=params,headers=headers,verify=False) # key point: SSL verification is disabled
        print(response.text[:300])
        # Raises IndexError when no challenge is present; the except below prints it.
        arg1 = re.findall("arg1='(.*?)';",response.text,re.S)[0]
        print('arg1--->',arg1)
        guid = str(uuid.uuid4()).replace("-", "")
        cookie = {'guid': str(guid)}
        # 04.js is opened relative to the current working directory.
        with open('04.js', 'r', encoding='utf-8') as f:
            js = f.read()
        # Compute the acw_sc__v2 cookie by calling l(arg1) in the JS file.
        acw_sc__v2 = execjs.compile(js).call('l', arg1)
        print('acw_sc__v2-->',acw_sc__v2)
        cookie.update({"acw_sc__v2": acw_sc__v2})
        # cookie.update({"acw_sc__v3": "649257ebe376df87b3db6a94c1e5ad37f42f783b"})
        # Second request carries the computed cookie; merge any cookies set in the reply.
        response2 = session.get('https://we.51job.com/api/job/search-pc', params=params,headers=headers,cookies=cookie,verify=False) #
        cookie.update(response2.cookies.get_dict())
        # Third request uses the merged cookie set; its body is printed in full.
        response = session.get('https://we.51job.com/api/job/search-pc', params=params,headers=headers, cookies=cookie,verify=False)
        print(response.text)
        time.sleep(0.5)
    except Exception as e:
        print(e)

View File

@ -0,0 +1 @@
# 智联招聘

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,68 @@
import math
import copy
# Digit alphabet for 5-bit groups: value 0..31 -> one character.
R = "0123456789ABCDEFGHIJKLMNOPQRSTUV"


def a(e: str) -> str:
    """Map a binary-digit string (e.g. ``"01010"``) to its character in ``R``."""
    t = int(e, 2)
    return R[t]


def n(e: str) -> str:
    """Encode *e* as base-32 text over 16-bit code units.

    Each UTF-16 code unit of *e* contributes 16 bits (the Python equivalent
    of JavaScript ``charCodeAt``). The bit string is right-padded with zeros
    to a multiple of 5, and every 5-bit group is mapped through ``a``.

    Characters outside the Basic Multilingual Plane are encoded as their two
    surrogate code units, as in JavaScript. Formatting ``ord(ch)`` directly
    would emit more than 16 bits for them.
    """
    # "surrogatepass" lets lone surrogates through unchanged, matching how
    # charCodeAt reports them.
    units = e.encode("utf-16-be", "surrogatepass")
    bits = "".join(format(byte, "08b") for byte in units)
    # Equivalent of padEnd to the next multiple of 5.
    pad_len = 5 * math.ceil(len(bits) / 5)
    bits = bits.ljust(pad_len, "0")
    return "".join(a(bits[i:i + 5]) for i in range(0, len(bits), 5))
def generate_url(e: dict) -> str:
    """Build the path-and-query part of a Zhilian search URL from *e*.

    The keys ``jl``, ``jt``, ``in``, ``kw`` and ``p`` (in that order) become
    ``/``-joined path segments when their value is truthy; ``kw`` is encoded
    with ``n`` first. Every other key with a truthy value is appended as a
    ``key=value`` query pair. The input mapping is deep-copied, not modified.
    """
    remaining = copy.deepcopy(e)
    segments = []
    for key in ("jl", "jt", "in", "kw", "p"):
        value = remaining.get(key)
        if not value:
            continue
        rendered = n(value) if key == "kw" else value
        segments.append(f"{key}{rendered}")
        del remaining[key]

    pairs = [f"{key}={value}" for key, value in remaining.items() if value]
    path = "/".join(segments)
    if pairs:
        path += "?" + "&".join(pairs)
    return path
if __name__ == '__main__':
    # Manual check: print the search URL for a sample city code and keyword.
    url = "https://www.zhaopin.com/sou/" + generate_url({'jl': 530, 'kw': 'app推广经理'})
    print(url)

View File

@ -0,0 +1,284 @@
from playwright.sync_api import sync_playwright, BrowserContext, Page
import time
import json
import os
from typing import List, Dict, Optional
from company_spider.zhilianzhaopin_company.searcc_kw import generate_url
class CityLoader:
    """Process-wide singleton mapping city names to codes.

    The mapping is loaded once, on first construction, from a JSON file
    whose ``allCity`` list holds entries with ``name``, ``code`` and an
    optional nested ``sublist``. Later constructions return the same,
    already-loaded object and ignore their ``city_file`` argument.
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        if cls._instance:
            return cls._instance
        cls._instance = super(CityLoader, cls).__new__(cls)
        return cls._instance

    def __init__(self, city_file="city.json"):
        # __init__ runs on every CityLoader() call; load only the first time.
        if not hasattr(self, 'city_map'):
            base_dir = os.path.dirname(os.path.abspath(__file__))
            self.file_path = os.path.join(base_dir, city_file)
            self.city_map = {}
            self._load_cities()

    def _load_cities(self):
        """Populate ``city_map`` from ``file_path``; problems are printed, not raised."""
        if os.path.exists(self.file_path):
            try:
                with open(self.file_path, 'r', encoding='utf-8') as handle:
                    payload = json.load(handle)
                self._parse_city_data(payload.get("allCity", []))
            except Exception as e:
                print(f"Error loading city file: {e}")
        else:
            print(f"City file not found: {self.file_path}")

    def _parse_city_data(self, cities):
        """Record each entry's name -> code, descending into non-empty sublists."""
        for entry in cities:
            self.city_map[entry['name']] = entry['code']
            children = entry.get('sublist')
            if children:
                self._parse_city_data(children)

    def get_code(self, city_name):
        """Return the code for *city_name*, or ``None`` when unknown."""
        return self.city_map.get(city_name)
def get_companies_from_page(page: Page) -> List[Dict[str, str]]:
    """Collect company names and links from a search-results page.

    Tries a list of CSS selectors in order and uses the first one that
    matches anything; if none match, falls back to every link whose href
    contains "company". Relative links are made absolute against
    ``https://www.zhaopin.com``. Entries are de-duplicated by name, keeping
    the first occurrence.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts in page order.
    """
    # Candidate selectors for company-name elements, most specific first.
    company_selectors = [
        'a[class*="company"]',
        '.company-name a',
        'a.company-name',
        '[class*="CompanyName"] a',
        'a[href*="/company/"]'
    ]
    company_elements = []
    for selector in company_selectors:
        try:
            elements = page.query_selector_all(selector)
        except Exception:
            # A failing selector just means "try the next one". Catching
            # Exception (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            continue
        if elements:
            company_elements = elements
            print(f"使用选择器找到 {len(elements)} 个元素: {selector}")
            break

    # Nothing matched: fall back to a broader link query.
    if not company_elements:
        all_links = page.query_selector_all('a[href*="company"]')
        company_elements = all_links
        print(f"通过通用方法找到 {len(all_links)} 个公司链接")

    companies = []
    seen_names = set()
    for element in company_elements:
        try:
            company_name = element.inner_text().strip()
            company_url = element.get_attribute('href')
        except Exception:
            # Skip elements that cannot be read.
            continue
        if not company_name or not company_url:
            continue
        # Turn relative links into absolute ones.
        if company_url.startswith('/'):
            company_url = f"https://www.zhaopin.com{company_url}"
        elif not company_url.startswith('http'):
            company_url = f"https://www.zhaopin.com/{company_url}"
        # De-duplicate by company name.
        if company_name in seen_names:
            continue
        seen_names.add(company_name)
        companies.append({
            'name': company_name,
            'url': company_url
        })
    return companies
def get_company_intro(context: BrowserContext, company_url: str) -> str:
    """Open a company page in a new tab and return its introduction text.

    Tries several CSS selectors and returns the first non-empty text. If
    none yields text, falls back to the first 500 characters of the page
    body. The tab is always closed, including on errors.

    Returns:
        The introduction text, "未找到公司简介" when nothing was found, or
        "获取失败: <error>" when opening or reading the page failed.
    """
    try:
        company_page = context.new_page()
        try:
            company_page.goto(company_url, wait_until="networkidle", timeout=30000)
            time.sleep(2)
            # Candidate selectors for the introduction block.
            intro_selectors = [
                '.company-intro',
                '.company-description',
                '[class*="intro"]',
                '[class*="description"]',
                '.company-info',
                '[class*="CompanyIntro"]'
            ]
            company_intro = ""
            for selector in intro_selectors:
                try:
                    intro_element = company_page.query_selector(selector)
                    if intro_element:
                        company_intro = intro_element.inner_text().strip()
                        if company_intro:
                            break
                except Exception:
                    # A failing selector just means "try the next one".
                    continue
            # Last resort: take the start of the whole page text.
            if not company_intro:
                try:
                    body = company_page.query_selector('body')
                    if body:
                        all_text = body.inner_text()
                        company_intro = all_text[:500]
                except Exception:
                    # Best effort only; fall through to the "not found" text.
                    pass
            return company_intro if company_intro else "未找到公司简介"
        finally:
            # Close the tab even when navigation or a query raised, so failed
            # lookups do not accumulate open pages in the context.
            company_page.close()
    except Exception as e:
        return f"获取失败: {str(e)}"
def _select_target_company(companies: List[Dict], target_company: str):
    """Return ``(company, label)`` for the best name match, or ``(None, None)``.

    An exact name match wins; otherwise the first company whose name
    contains, or is contained in, the target is used. ``label`` is the word
    placed in the progress message ("完全" for exact, "部分" for partial).
    """
    for company in companies:
        if company['name'].strip() == target_company:
            return company, "完全"
    for company in companies:
        company_name = company['name'].strip()
        if target_company in company_name or company_name in target_company:
            return company, "部分"
    return None, None


def crawl_companies(params: Dict, max_companies: int = 10, headless: bool = False, proxy: Optional[str] = None) -> List[Dict]:
    """Crawl company information from Zhilian Zhaopin search results.

    Args:
        params: Search parameters, e.g. ``{'jl': 530, 'kw': 'app推广经理'}``
            or ``{'city': '北京', 'kw': '...'}``. A ``city`` name is mapped
            to its ``jl`` code when ``jl`` is absent. The dict passed in is
            not modified.
        max_companies: Maximum number of companies to fetch when no name
            match is found (default 10).
        headless: Run the browser headless (default False).
        proxy: Proxy address, e.g. ``"http://user:pass@host:port"``.

    Returns:
        A list of dicts with ``name``, ``url`` and ``intro``. When a company
        name matches ``kw`` (exactly, else by containment), only that
        company is returned, so the list has length 1.
    """
    # Work on a shallow copy so filling in 'jl' does not leak to the caller.
    params = dict(params)

    # Map the city name to its code.
    # NOTE(review): 'city' stays in params, so generate_url also emits it as
    # a query pair — confirm whether that is intended.
    if 'city' in params and 'jl' not in params:
        city_loader = CityLoader()
        code = city_loader.get_code(params['city'])
        if code:
            print(f"城市 '{params['city']}' 映射代码为: {code}")
            params['jl'] = code
        else:
            print(f"未找到城市 '{params['city']}' 的代码")

    # ``or ''`` tolerates an explicit kw=None as well as a missing key.
    target_company = (params.get('kw') or '').strip()

    with sync_playwright() as p:
        launch_args = ["--disable-blink-features=AutomationControlled"]
        browser_kwargs = {
            "headless": headless,
            "args": launch_args
        }
        # Prefer a locally installed Chrome when present; otherwise the
        # default browser is used.
        chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
        if os.path.exists(chrome_path):
            browser_kwargs["executable_path"] = chrome_path
        if proxy:
            browser_kwargs["proxy"] = {"server": proxy}
            print(f"使用代理: {proxy}")

        browser = p.chromium.launch(**browser_kwargs)
        try:
            context = browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            try:
                page = context.new_page()
                # Build the search URL and open the results page.
                url = f"https://www.zhaopin.com/sou/{generate_url(params)}"
                print(f"访问URL: {url}")
                page.goto(url, wait_until="networkidle", timeout=30000)
                time.sleep(3)

                companies = get_companies_from_page(page)
                print(f"\n找到 {len(companies)} 家公司")

                # With a keyword, try to single out the matching company.
                if target_company:
                    print(f"搜索目标公司: {target_company}")
                    matched, label = _select_target_company(companies, target_company)
                    if matched is not None:
                        company_name = matched['name'].strip()
                        print(f"找到{label}匹配的公司: {company_name}")
                        print("正在获取公司简介...")
                        company_intro = get_company_intro(context, matched['url'])
                        return [{
                            'name': company_name,
                            'url': matched['url'],
                            'intro': company_intro
                        }]

                # No match: fetch the first max_companies entries instead.
                print(f"未找到完全匹配的公司,获取前 {max_companies} 家公司信息")
                results = []
                for i, company in enumerate(companies[:max_companies], 1):
                    print(f"\n[{i}/{min(max_companies, len(companies))}] 正在获取: {company['name']}")
                    company_intro = get_company_intro(context, company['url'])
                    results.append({
                        'name': company['name'],
                        'url': company['url'],
                        'intro': company_intro
                    })
                    time.sleep(1)
                return results
            finally:
                # Single cleanup point for every return path and for errors.
                context.close()
        finally:
            browser.close()
if __name__ == '__main__':
    # Manual test run: crawl by city name and keyword, then print the results.
    params = {'city': '北京', 'kw': 'app推广经理'}
    results = crawl_companies(params, max_companies=10)

    print("\n" + "=" * 80)
    print("爬取结果:")
    print("=" * 80)
    for result in results:
        print(f"\n公司名称: {result['name']}")
        print(f"公司链接: {result['url']}")
        # Long introductions are cut to 200 characters with an ellipsis.
        if len(result['intro']) > 200:
            print(f"公司简介: {result['intro'][:200]}...")
        else:
            print(f"公司简介: {result['intro']}")
        print("-" * 80)

View File

@ -0,0 +1,6 @@
{"timestamp": "2026-01-15 00:38:02", "total_crawled": 517, "unique_count": 503, "duplicate_count": 14, "api_total_count": 505, "job_area": "商丘", "function_type": "8305"}
{"timestamp": "2026-01-15 01:36:23", "total_crawled": 517, "unique_count": 509, "duplicate_count": 8, "api_total_count": 517, "job_area": "广安", "function_type": "1318"}
{"timestamp": "2026-01-15 02:32:36", "total_crawled": 517, "unique_count": 511, "duplicate_count": 6, "api_total_count": 517, "job_area": "阜阳", "function_type": "6101"}
{"timestamp": "2026-01-15 03:32:52", "total_crawled": 517, "unique_count": 513, "duplicate_count": 4, "api_total_count": 517, "job_area": "常德", "function_type": "3812"}
{"timestamp": "2026-01-15 04:31:42", "total_crawled": 517, "unique_count": 510, "duplicate_count": 7, "api_total_count": 517, "job_area": "惠州", "function_type": "3335"}
{"timestamp": "2026-01-15 05:28:54", "total_crawled": 517, "unique_count": 515, "duplicate_count": 2, "api_total_count": 517, "job_area": "锦州", "function_type": "0154"}

File diff suppressed because it is too large Load Diff

View File

@ -25,7 +25,7 @@ os.makedirs("logs", exist_ok=True)
logger.add("logs/log_{time:YYYY-MM-DD}.log", level="INFO", rotation="00:00", retention="30 days", enqueue=True)
API_BASE_URL = os.getenv('API_BASE_URL', 'http://127.0.0.1:9999')
API_BASE_URL = os.getenv('API_BASE_URL', 'http://124.222.106.226:9999')
API_PUBLIC_HOST = os.getenv('API_PUBLIC_HOST')
# Proxy endpoint; can be overridden with the PROXY_URL environment variable,
# like the other settings above.
# NOTE(security): the fallback below embeds a proxy username and password in
# source control. It is kept only so existing deployments keep working —
# rotate the credential and move it to the environment.
PROXY_URL = os.getenv('PROXY_URL', "http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818")