up
This commit is contained in:
parent
7285475eb5
commit
3d7e96845d
2
Pipfile
2
Pipfile
@ -63,6 +63,8 @@ uvicorn = "==0.34.0"
|
||||
uvloop = "==0.21.0"
|
||||
watchfiles = "==1.0.4"
|
||||
websockets = "==14.1"
|
||||
PyExecJS = "==1.5.1"
|
||||
playwright = "==1.57.0"
|
||||
asyncpg = "*"
|
||||
pandas = "*"
|
||||
openpyxl = "*"
|
||||
|
||||
@ -17,6 +17,7 @@ from .pipeline import pipeline_router
|
||||
from .keyword import keyword_router
|
||||
from .cleaning import cleaning_router
|
||||
from .analytics import router as analytics_router
|
||||
from .company import company_router
|
||||
|
||||
|
||||
v1_router = APIRouter()
|
||||
@ -37,3 +38,4 @@ v1_router.include_router(pipeline_router, prefix="/pipeline")
|
||||
v1_router.include_router(keyword_router, prefix="/keyword")
|
||||
v1_router.include_router(cleaning_router, prefix="/cleaning", dependencies=[DependPermission])
|
||||
v1_router.include_router(analytics_router, prefix="/analytics", tags=["数据分析"])
|
||||
v1_router.include_router(company_router, prefix="/company", tags=["公司搜索"])
|
||||
|
||||
3
app/api/v1/company/__init__.py
Normal file
3
app/api/v1/company/__init__.py
Normal file
@ -0,0 +1,3 @@
|
||||
from .company import router as company_router
|
||||
|
||||
__all__ = ["company_router"]
|
||||
47
app/api/v1/company/company.py
Normal file
47
app/api/v1/company/company.py
Normal file
@ -0,0 +1,47 @@
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from app.controllers.company import CompanyController, create_company_controller
|
||||
from app.schemas.base import Fail, Success
|
||||
|
||||
|
||||
router = APIRouter(tags=["公司搜索"])
|
||||
|
||||
|
||||
class ZhilianSearchRequest(BaseModel):
    """Request body for the Zhilian company-search endpoint."""

    # Required company-name keyword.
    keyword: str = Field(..., description="公司名称关键词")
    # Optional city name; defaults to None when the client omits it.
    city: Optional[str] = Field(default=None, description="城市名称,如北京")
|
||||
|
||||
|
||||
class QcwySearchRequest(BaseModel):
    """Request body for the 51job (qcwy) company-search endpoint."""

    # Required company-name keyword.
    keyword: str = Field(default=..., description="公司名称关键词")
|
||||
|
||||
|
||||
async def get_company_controller() -> CompanyController:
    """Dependency provider: build a controller via ``create_company_controller``."""
    controller = create_company_controller()
    return controller
|
||||
|
||||
|
||||
@router.post("/zhilian/search", summary="智联招聘公司搜索")
async def zhilian_search_company(
    request: ZhilianSearchRequest,
    controller: CompanyController = Depends(get_company_controller),
) -> Dict[str, Any]:
    """Run a Zhilian company search for the request's keyword and optional city.

    The controller result is wrapped in ``Success``; any exception raised
    while searching (or while building the success payload) is turned into
    a ``Fail`` whose message embeds the exception text.
    """
    try:
        companies = await controller.search_zhilian_company(
            request.keyword,
            request.city,
        )
        response = Success(data=companies)
    except Exception as err:
        response = Fail(msg=f"智联公司搜索失败: {err}")
    return response
|
||||
|
||||
|
||||
@router.post("/qcwy/search", summary="前程无忧公司搜索")
async def qcwy_search_company(
    request: QcwySearchRequest,
    controller: CompanyController = Depends(get_company_controller),
) -> Dict[str, Any]:
    """Run a 51job (qcwy) company search for the request's keyword.

    The controller result is wrapped in ``Success``; any exception raised
    while searching (or while building the success payload) is turned into
    a ``Fail`` whose message embeds the exception text.
    """
    try:
        match = await controller.search_qcwy_company(request.keyword)
        response = Success(data=match)
    except Exception as err:
        response = Fail(msg=f"前程无忧公司搜索失败: {err}")
    return response
|
||||
20
app/controllers/company.py
Normal file
20
app/controllers/company.py
Normal file
@ -0,0 +1,20 @@
|
||||
import asyncio
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from company_spider.qcwy_company.spider import search_company as qcwy_search_company
|
||||
from company_spider.zhilianzhaopin_company.spider import crawl_companies
|
||||
|
||||
|
||||
class CompanyController:
    """Async wrapper that runs the synchronous company spiders in worker threads."""

    async def search_qcwy_company(self, keyword: str) -> Optional[Dict[str, Any]]:
        """Run the 51job ``search_company`` spider for *keyword* off the event loop."""
        pending = asyncio.to_thread(qcwy_search_company, keyword)
        return await pending

    async def search_zhilian_company(self, keyword: str, city: Optional[str] = None) -> List[Dict[str, Any]]:
        """Run the Zhilian ``crawl_companies`` spider off the event loop.

        The spider receives ``{"kw": keyword}``, plus a ``"city"`` entry only
        when *city* is truthy, and the positional value ``10``.
        """
        # NOTE(review): the meaning of the second argument (10) is defined by
        # crawl_companies, which is not visible here — confirm before changing.
        search_params = {"kw": keyword, **({"city": city} if city else {})}
        pending = asyncio.to_thread(crawl_companies, search_params, 10)
        return await pending
|
||||
|
||||
|
||||
def create_company_controller() -> CompanyController:
    """Factory returning a fresh ``CompanyController`` on every call."""
    controller = CompanyController()
    return controller
|
||||
46
company_spider/Dockerfile
Normal file
46
company_spider/Dockerfile
Normal file
@ -0,0 +1,46 @@
|
||||
FROM python:3.11-slim

# Point apt-get at the Aliyun mirror. The first sed targets the deb822-style
# sources file, the second the classic sources.list; "|| true" keeps the build
# going if neither file exists.
RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources || \
    sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list || true

# Install system dependencies
# Node.js is required for PyExecJS
RUN apt-get update && apt-get install -y \
    nodejs \
    npm \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Point npm at the npmmirror (Taobao) registry
RUN npm config set registry https://registry.npmmirror.com

WORKDIR /app

# Configure pip to use a domestic (Tsinghua) index by writing a pip.conf
RUN mkdir -p /root/.pip && \
    echo '[global]' > /root/.pip/pip.conf && \
    echo 'index-url = https://pypi.tuna.tsinghua.edu.cn/simple' >> /root/.pip/pip.conf && \
    echo 'trusted-host = pypi.tuna.tsinghua.edu.cn' >> /root/.pip/pip.conf && \
    echo 'timeout = 120' >> /root/.pip/pip.conf

# Copy requirements first to leverage cache
COPY requirements.txt .

# Install Python dependencies through the index configured above
RUN pip install --no-cache-dir -r requirements.txt

# Install Playwright browsers and system dependencies
# We only need chromium for this project
# Download Playwright browser builds from the npmmirror mirror
ENV PLAYWRIGHT_DOWNLOAD_HOST=https://npmmirror.com/mirrors/playwright
RUN playwright install chromium
RUN playwright install-deps chromium

COPY . .

# Expose the port
EXPOSE 8000

# Run the application
# NOTE(review): the commit shown here adds no main.py next to this Dockerfile —
# confirm that the "main:app" target exists in the build context.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
0
company_spider/__init__.py
Normal file
0
company_spider/__init__.py
Normal file
12
company_spider/demo
Normal file
12
company_spider/demo
Normal file
@ -0,0 +1,12 @@
|
||||
# Example: 51job (qcwy) company search — body carries only "keyword".
curl --location 'http://127.0.0.1:9999/api/v1/company/qcwy/search' \
--header 'Content-Type: application/json' \
--data '{
"keyword": "中信期货有限公司"
}'

# Example: Zhilian company search — body carries "keyword" and an optional "city".
curl --location 'http://127.0.0.1:9999/api/v1/company/zhilian/search' \
--header 'Content-Type: application/json' \
--data '{
"keyword": "中信期货有限公司",
"city":"北京"
}'
|
||||
65
company_spider/qcwy_company/04.js
Normal file
65
company_spider/qcwy_company/04.js
Normal file
@ -0,0 +1,65 @@
|
||||
window = {};
|
||||
|
||||
var arg3 = null;
|
||||
var arg4 = null;
|
||||
var arg5 = null;
|
||||
var arg6 = null;
|
||||
var arg7 = null;
|
||||
var arg8 = null;
|
||||
var arg9 = null;
|
||||
var arg10 = null;
|
||||
|
||||
var l = function (arg1) {
|
||||
while (window["_phantom"] || window["__phantomas"]) {
|
||||
}
|
||||
|
||||
var _0x5e8b26 = "3000176000856006061501533003690027800375";
|
||||
|
||||
String["prototype"]["hexXor"] = function (_0x4e08d8) {
|
||||
var _0x5a5d3b = "";
|
||||
|
||||
for (var _0xe89588 = 0; _0xe89588 < this["length"] && _0xe89588 < _0x4e08d8["length"]; _0xe89588 += 2) {
|
||||
var _0x401af1 = parseInt(this["slice"](_0xe89588, _0xe89588 + 2), 16);
|
||||
|
||||
var _0x105f59 = parseInt(_0x4e08d8["slice"](_0xe89588, _0xe89588 + 2), 16);
|
||||
|
||||
var _0x189e2c = (_0x401af1 ^ _0x105f59)["toString"](16);
|
||||
|
||||
if (_0x189e2c["length"] == 1) {
|
||||
_0x189e2c = "0" + _0x189e2c;
|
||||
}
|
||||
|
||||
_0x5a5d3b += _0x189e2c;
|
||||
}
|
||||
|
||||
return _0x5a5d3b;
|
||||
};
|
||||
|
||||
String["prototype"]["unsbox"] = function () {
|
||||
var _0x4b082b = [15, 35, 29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11, 39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36];
|
||||
var _0x4da0dc = [];
|
||||
var _0x12605e = "";
|
||||
|
||||
for (var _0x20a7bf = 0; _0x20a7bf < this["length"]; _0x20a7bf++) {
|
||||
var _0x385ee3 = this[_0x20a7bf];
|
||||
|
||||
for (var _0x217721 = 0; _0x217721 < _0x4b082b["length"]; _0x217721++) {
|
||||
if (_0x4b082b[_0x217721] == _0x20a7bf + 1) {
|
||||
_0x4da0dc[_0x217721] = _0x385ee3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
_0x12605e = _0x4da0dc["join"]("");
|
||||
return _0x12605e;
|
||||
};
|
||||
|
||||
var _0x23a392 = arg1["unsbox"]();
|
||||
|
||||
arg2 = _0x23a392["hexXor"](_0x5e8b26);
|
||||
console.log('arg2==>', arg2)
|
||||
// setTimeout("reload(arg2)", 2);
|
||||
return arg2
|
||||
};
|
||||
// var arg1 = "FAA6CB46CF724D58FF82E5310687947623413114";
|
||||
// l(arg1)
|
||||
0
company_spider/qcwy_company/__init__.py
Normal file
0
company_spider/qcwy_company/__init__.py
Normal file
6533
company_spider/qcwy_company/out.json
Normal file
6533
company_spider/qcwy_company/out.json
Normal file
File diff suppressed because it is too large
Load Diff
91
company_spider/qcwy_company/qcwy.js
Normal file
91
company_spider/qcwy_company/qcwy.js
Normal file
@ -0,0 +1,91 @@
|
||||
import crypto from 'crypto';
|
||||
|
||||
// 使用 crypto
|
||||
// HMAC-SHA256 of `message` under `key`, returned as lowercase hex (Node crypto).
const hmacSHA256 = (message, key) => {
    const mac = crypto.createHmac('sha256', key);
    mac.update(message);
    return mac.digest('hex');
};

// Sign a request config: logs its url and data, then signs url + (data or "")
// with the fixed secret.
function A(t) {
    console.log(t.url)
    console.log(t.data)
    const payload = t.url + (t.data || "");
    const secret = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b";
    return hmacSHA256(payload, secret);
}
|
||||
|
||||
|
||||
// Copies properties from every argument after the first onto `e` and returns `e`.
// NOTE(review): this function references `r` and `i`, neither of which is
// defined anywhere in this file, so calling it with more than one argument
// throws a ReferenceError. Nothing in this file calls it.
function a(e) {
    for (var t = 1; t < arguments.length; t++) {
        // Treat a null/undefined source as an empty object.
        var n = null != arguments[t] ? arguments[t] : {};
        // Odd-numbered arguments go through r()/i["a"]; even-numbered ones are
        // copied via property descriptors.
        t % 2 ? r(Object(n), !0).forEach((function (t) {
                Object(i["a"])(e, t, n[t])
            }
        )) : Object.getOwnPropertyDescriptors ? Object.defineProperties(e, Object.getOwnPropertyDescriptors(n)) : r(Object(n)).forEach((function (t) {
                Object.defineProperty(e, t, Object.getOwnPropertyDescriptor(n, t))
            }
        ))
    }
    return e
}
|
||||
|
||||
// Sample request config used as input for A(). Only `url` and `data` are read
// by A(); the remaining fields are carried along unchanged.
// The query string separators "&timestamp" and "&degree" are restored here:
// they had been mangled into "×tamp" and "°ree" by HTML-entity rendering.
var t = {
    "transitional": {"silentJSONParsing": true, "forcedJSONParsing": true, "clarifyTimeoutError": false},
    "transformRequest": [null],
    "transformResponse": [null],
    "timeout": 30000,
    "xsrfCookieName": "XSRF-TOKEN",
    "xsrfHeaderName": "X-XSRF-TOKEN",
    "maxContentLength": -1,
    "maxBodyLength": -1,
    "headers": {
        "common": {"Accept": "application/json, text/plain, */*"},
        "delete": {},
        "get": {"Content-Type": "application/x-www-form-urlencoded"},
        "head": {},
        "post": {"Content-Type": "application/json"},
        "put": {"Content-Type": "application/x-www-form-urlencoded"},
        "patch": {"Content-Type": "application/x-www-form-urlencoded"}
    },
    "baseURL": "https://we.51job.com",
    "withCredentials": true,
    "url": "/api/job/search-pc?api_key=51job&timestamp=1769136341&keyword=%E5%8D%8E%E4%B8%BA%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&searchType=2&function=&industry=&jobArea=010000&jobArea2=&landmark=&metro=&salary=&workYear=&degree=&companyType=&companySize=&jobType=&issueDate=&sortType=0&pageNum=1&requestId=&keywordType=&pageSize=20&source=1&accountId=&pageCode=sou%7Csou%7Csoulb&scene=7",
    "method": "get",
    "property": {"keywordType": ""}
};
|
||||
|
||||
// Sample tracking/"property" payload. It is not referenced by anything else
// in this file; only `t` is passed to A() below.
var b = {
    "partner": "cn_bing_com",
    "webId": 2,
    "fromdomain": "51job_web",
    "frompageUrl": "https://we.51job.com/",
    "pageUrl": "https://we.51job.com/pc/search?jobArea=010000&keyword=%E5%8D%8E%E4%B8%BA%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&searchType=2&keywordType=",
    "identityType": "",
    "userType": "",
    "isLogin": "否",
    "accountid": ""
}
// Print the signature computed for the sample request config.
console.log(A(t));
|
||||
|
||||
// function wordsToHex(words) {
|
||||
// // CryptoJS 使用 32 位有符号整数存储,需要处理
|
||||
// let hex = '';
|
||||
// for (let i = 0; i < words.length; i++) {
|
||||
// // 将负数转换为无符号整数
|
||||
// const word = words[i] >>> 0;
|
||||
// // 转换为十六进制并补零
|
||||
// hex += word.toString(16).padStart(8, '0');
|
||||
// }
|
||||
// return hex;
|
||||
// }
|
||||
//
|
||||
// // 你的数据
|
||||
// const words = [-762966511, 1702028048, 94455509, -201850815,
|
||||
// 300412866, 1405396681, 85275542, 246713406];
|
||||
//
|
||||
// console.log(wordsToHex(words))
|
||||
|
||||
/*
|
||||
* sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2219be8d71f8213f1-0fd9b910813aa58-4c657b58-3686400-19be8d71f831716%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTliZThkNzFmODIxM2YxLTBmZDliOTEwODEzYWE1OC00YzY1N2I1OC0zNjg2NDAwLTE5YmU4ZDcxZjgzMTcxNiJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%2219be8d71f8213f1-0fd9b910813aa58-4c657b58-3686400-19be8d71f831716%22%7D; ssxmod_itna=1-Gqmx0DuD2Dc0D=Ni73itD2Dp=DmpsKeQDzxCH9P0CCDLxn4xGdY2=Cw3DyD7Tp4RG5DaQi2Yea0xGXKwDA5DnCx7YDt=RcwxK06dvxK=W0mitswe6uDuYLP2GGRRgW_GlDMEHLA6C5N7qxDHwd4KxGLDY=DCqxq57eD4f3Dt4DIDAYDDxDWDYEPxGUQDG=D7rTi5pWtxi3DboaDmd2WC=FD03q=EWFoDDtAbeG2bETqDDNqF9G3_lh3_PD_bW9QKtWemFxPneDMbxGX7YCqnlH2oyDWpFkUsao3xB=gxBQbyPnhwETadZanDY4lGrWYY2DIjGxWxiGG1i05Q03nwsWmwlG1Gv_GDxhw4SrUDDAt_hWDHBRqW0tK2lj5/bc_9yYtAbYW=LgrRxxWqqRAOIotBhNi47fD5=4qf0esGthu5oiPeD; ssxmod_itna2=1-Gqmx0DuD2Dc0D=Ni73itD2Dp=DmpsKeQDzxCH9P0CCDLxn4xGdY2=Cw3DyD7Tp4RG5DaQi2YeaKxDfrQfQGh4qBFjq03_jSefWDlO03BqKSSfAaeFuhD2y0F5nKj4LMzWF2qLViLAjiLzGteYAj1KAULY4hzS3=uiLiHAktq7AQK04=RCrN4_lNnDaNPYDr4nhTEjfu/3d5Fcwil7pUxfDu7yjj5TT0UnkUbM4F0FALQk19oO64i1g2QsibdzqxtPn8oOB3wpj5FVm6R_LF2EKxZIWFfaGt9oNT4U_0IjQx40hUsUKLNOBzuR1Mh=_gTlLdLS53B3OE4dGDB8GdjhOf4MYhuE37oTUMtTCwOOD7WhhjwgohMumFghOCNeDxRqr92NTeIRW=oOeThvw7DBG5/DoFShd7v5ZxwYEKiDD
|
||||
* */
|
||||
327
company_spider/qcwy_company/spider.py
Normal file
327
company_spider/qcwy_company/spider.py
Normal file
@ -0,0 +1,327 @@
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
import execjs
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
import requests
|
||||
from urllib.parse import unquote, quote
|
||||
from typing import Optional, Dict
|
||||
import os
|
||||
|
||||
|
||||
class SignGenerator:
    """Produce the request signatures and anti-bot cookie used by the 51job spider.

    Signatures are HMAC-SHA256 hex digests keyed with ``secret_key``; the
    ``acw_sc__v2`` cookie is computed by the ``l`` function in ``04.js``.
    """

    def __init__(self):
        # Signing secret (taken from the site's JS code).
        self.secret_key = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b"
        self.secret_key_bytes = self.secret_key.encode('utf-8')

    def hmac_sha256(self, message, key):
        """Return the HMAC-SHA256 hex digest of *message* under *key*.

        Both arguments may be ``str`` (encoded as UTF-8) or ``bytes``.
        """
        key_bytes = key.encode('utf-8') if isinstance(key, str) else key
        message_bytes = message.encode('utf-8') if isinstance(message, str) else message
        return hmac.new(key_bytes, message_bytes, hashlib.sha256).hexdigest()

    def generate_signature(self, t):
        """Sign a request config (counterpart of function ``A`` in the site JS).

        JS logic: ``signature = hmacSHA256(url + (data || ""), secret_key)``.

        Args:
            t: mapping with an optional ``"url"`` and an optional ``"data"``
                entry. A dict ``data`` is serialized as compact JSON.

        Returns:
            The hex signature string.
        """
        # `or ""` mirrors the JS `data || ""`: a present-but-None (or otherwise
        # falsy) data value is treated as empty instead of raising TypeError.
        data = t.get("data") or ""
        return self.generate_signature_from_components(t.get("url", ""), data)

    def generate_signature_from_components(self, url, data=None):
        """Sign ``url + data``; ``None`` data is empty, a dict is dumped as compact JSON."""
        if data is None:
            data = ""
        elif isinstance(data, dict):
            data = json.dumps(data, ensure_ascii=False, separators=(',', ':'))

        return self.hmac_sha256(url + data, self.secret_key)

    def generate_acw_sc__v2(self, arg1):
        """Compute the ``acw_sc__v2`` cookie by calling ``l(arg1)`` from ``04.js``.

        ``04.js`` is loaded from the directory containing this module.
        Returns ``None`` when the script yields a falsy value.
        """
        current_dir = os.path.dirname(os.path.abspath(__file__))
        js_file_path = os.path.join(current_dir, '04.js')
        with open(js_file_path, 'r', encoding='utf-8') as f:
            js = f.read()
        acw_sc__v2 = execjs.compile(js).call('l', arg1)
        return acw_sc__v2 or None

    def generate_company_detail(self, cid: str) -> dict:
        """Sign a company-detail request for the encrypted company id *cid*.

        Returns:
            ``{"signature": <hex digest>, "timestamp": <int seconds>}``. The
            caller must send the same timestamp that was signed.
        """
        timestamp = int(time.time())
        # The signed string is the request path plus query. "&timestamp" is
        # restored here; it had been mangled into "×tamp".
        message = f"/open/noauth/company-info/pc-info?api_key=51job&timestamp={timestamp}&encryCompanyId={cid}"
        # Reuse the instance secret instead of a second hard-coded copy.
        signature = self.hmac_sha256(message, self.secret_key)
        return {"signature": signature, "timestamp": timestamp}
|
||||
|
||||
|
||||
def search_company(keyword: str, job_area: str = "000000") -> Optional[Dict]:
    """Search 51job for a company whose full name equals *keyword*.

    Args:
        keyword: company name to search for.
        job_area: job-area code sent as ``jobArea``; the default "000000"
            means nationwide.

    Returns:
        A dict with ``fullCompanyName``, ``companyName`` and ``companyHref``
        for the first result whose full company name equals the stripped
        keyword, or ``None`` when there is no such result, the response is
        not usable, or any error occurs (errors are printed, not raised).
    """
    signer = SignGenerator()
    session = requests.Session()
    search_url = 'https://we.51job.com/api/job/search-pc'

    # Request parameters; the timestamp is the current epoch second.
    params = {
        'api_key': '51job',
        'timestamp': str(int(time.time())),
        'keyword': keyword,
        'searchType': '2',  # 2 = search by company
        'function': '',
        'industry': '',
        'jobArea': job_area,
        'jobArea2': '',
        'landmark': '',
        'metro': '',
        'salary': '',
        'workYear': '',
        'degree': '',
        'companyType': '',
        'companySize': '',
        'jobType': '',
        'issueDate': '',
        'sortType': '0',
        'pageNum': '1',
        'requestId': '',
        'pageSize': '20',
        'source': '1',
        'accountId': '',
        'pageCode': 'sou|sou|soulb',
        'scene': '7'
    }

    def _signed_part(name, value):
        """Render one ``name=value`` pair of the string that gets signed."""
        text = str(value) if value else ''
        # Only the keyword is URL-encoded in the signed string; every other
        # value is used verbatim.
        if name == 'keyword':
            text = quote(text)
        return f'{name}={text}'

    # NOTE(review): requests encodes the real query itself (e.g. "|" becomes
    # "%7C"), so the signed string can differ from the URL actually sent —
    # confirm against the server's signature check.
    query_string = '&'.join(_signed_part(k, v) for k, v in params.items())
    sign = signer.generate_signature_from_components(f"/api/job/search-pc?{query_string}")

    headers = {
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'From-Domain': '51job_web',
        'Pragma': 'no-cache',
        'Referer': f'https://we.51job.com/pc/search?keyword={quote(keyword)}&searchType=2&sortType=0&metro=',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
        'account-id': '',
        'partner': '',
        'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2Fpc%2Fsearch%3Fkeyword%3D' + quote(
            keyword) + '%26searchType%3D2%26sortType%3D0%26metro%3D%22%2C%22identityType%22%3A%22%22%2C%22userType%22%3A%22%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountid%22%3A%22%22%7D',
        'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sign': sign,
        'user-token': '',
        'uuid': str(uuid.uuid4()).replace('-', ''),
    }

    def _fetch(cookies=None):
        """Issue the search GET, optionally with explicit cookies."""
        # NOTE(security): verify=False disables TLS certificate verification.
        # Kept from the original; review whether it is really required.
        return session.get(
            search_url,
            params=params,
            headers=headers,
            cookies=cookies,
            verify=False,
            timeout=30
        )

    try:
        # The first request may return an anti-bot page carrying `arg1`.
        response = _fetch()

        if 'arg1' in response.text:
            arg1_match = re.findall(r"arg1='(.*?)';", response.text, re.S)
            acw_sc__v2 = signer.generate_acw_sc__v2(arg1_match[0]) if arg1_match else None
            if acw_sc__v2:
                cookies = {
                    'guid': str(uuid.uuid4()).replace("-", ""),
                    'acw_sc__v2': acw_sc__v2
                }
                # Second request: present the computed cookie and collect
                # whatever cookies the server sets in return.
                cookies.update(_fetch(cookies).cookies.get_dict())
                # Third request: repeat with the complete cookie set.
                response = _fetch(cookies)

        if response.status_code != 200:
            return None

        try:
            data = response.json()
        except json.JSONDecodeError:
            print(f"[错误] 响应不是有效的JSON: {response.text[:200]}")
            return None

        if data.get('status') != '1' or 'resultbody' not in data:
            return None
        resultbody = data['resultbody']
        if 'job' not in resultbody or 'items' not in resultbody['job']:
            return None

        # Return the first result whose full company name matches exactly.
        wanted = keyword.strip()
        for item in resultbody['job']['items']:
            full_company_name = item.get('fullCompanyName', '').strip()
            if full_company_name == wanted:
                return {
                    'fullCompanyName': full_company_name,
                    'companyName': item.get('companyName', '').strip(),
                    'companyHref': item.get('companyHref', '').strip()
                }
        return None

    except Exception as e:
        # Best-effort boundary: report the failure and signal "no result".
        print(f"[错误] 请求失败: {e}")
        import traceback
        print(traceback.format_exc())
        return None
|
||||
|
||||
|
||||
def parse_json_company_desc(uri: str) -> dict:
    """Fetch a company's description and address from the 51job company-info API.

    The encrypted company id is the last path segment of *uri* with the
    ".html" suffix and a leading "co" prefix removed.

    Args:
        uri: company page URL, e.g.
            ``https://jobs.51job.com/all/coUT9QPQdhBzEGY1A1VjQ.html``.

    Returns:
        On success a dict with ``company_desc``, ``company_location`` and
        ``encryCompanyId``. On a failed request or any error, a dict holding
        only ``company_desc`` and ``company_location`` set to a failure marker.
    """
    sy = uri.split("/")[-1].replace(".html", "")
    # Strip only the leading "co"; str.replace would also remove any "co"
    # occurring inside the id itself.
    cid = sy[2:] if sy.startswith("co") else sy

    signer = SignGenerator()
    generate_company_detail_info = signer.generate_company_detail(cid)
    headers = {
        'Host': 'cupid.51job.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0',
        'Accept': 'application/json, text/plain, */*',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'sign': generate_company_detail_info["signature"],
        'uuid': '1e6151f7bc3ce8d7e526c88d7d6592cd',
        'From-Domain': '51job_web',
        'account-id': '',
        'user-token': '',
        'partner': '',
        'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fjobs.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fjobs.51job.com%2Fall%2Fco4194496.html%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountId%22%3A%22%22%2C%22shortPageCode%22%3A%22gsxq%7Czwlb%7Cgsxqlb%22%2C%22pageCode%22%3A%22gsxq%7Czwlb%7Cgsxqlb%22%7D',
        'Origin': 'https://jobs.51job.com',
        'Connection': 'keep-alive',
        'Referer': 'https://jobs.51job.com/',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-site',
        'TE': 'trailers',
    }

    try:
        # The timestamp in the URL must be the one that was signed.
        # "&timestamp" is restored here; it had been mangled into "×tamp".
        desc_url = f"https://cupid.51job.com/open/noauth/company-info/pc-info?api_key=51job&timestamp={generate_company_detail_info['timestamp']}&encryCompanyId={cid}"
        # NOTE(security): verify=False disables TLS certificate verification;
        # kept from the original. The timeout stops a stalled connection from
        # blocking the caller forever (30s, as in search_company).
        res = requests.get(url=desc_url, headers=headers, verify=False, timeout=30)
        # A Response is falsy for 4xx/5xx status codes.
        if not res:
            return {"company_desc": "请求失败", "company_location": "请求失败"}

        coinfo = res.json()["resultbody"]["coinfo"]
        return {"company_desc": coinfo["coinfo"], "company_location": coinfo["caddr"],"encryCompanyId":coinfo["encryCompanyId"]}

    except Exception as e:
        # Best-effort boundary: report and return the failure marker.
        print(f"解析HTML失败: {e}")
        return {"company_desc": "解析失败", "company_location": "解析失败"}
|
||||
|
||||
|
||||
# Usage example: runs only when this file is executed as a script.
if __name__ == "__main__":
    # Try the company search with a sample keyword.
    keyword = "华为技术有限公司"
    result = search_company(keyword)
    if result:
        print(f"找到匹配的公司:")
        print(f" 全称: {result['fullCompanyName']}")
        print(f" 简称: {result['companyName']}")
        print(f" 链接: {result['companyHref']}")
    else:
        print(f"未找到匹配的公司: {keyword}")
    # Fetch and print the detail record for a sample company page.
    print(parse_json_company_desc("https://jobs.51job.com/all/coA2RXNgBnATgPaQJn.html"))
|
||||
92
company_spider/qcwy_company/test.py
Normal file
92
company_spider/qcwy_company/test.py
Normal file
@ -0,0 +1,92 @@
|
||||
# -*- coding: UTF-8 -*-
|
||||
import time
|
||||
import uuid
|
||||
import requests
|
||||
import re
|
||||
import json
|
||||
import execjs
|
||||
|
||||
|
||||
headers = {
|
||||
'Accept': 'application/json, text/plain, */*',
|
||||
'Accept-Language': 'zh',
|
||||
'Cache-Control': 'no-cache',
|
||||
'Connection': 'keep-alive',
|
||||
'From-Domain': '51job_web',
|
||||
'Pragma': 'no-cache',
|
||||
'Referer': 'https://we.51job.com/pc/search?keyword=java&searchType=2&sortType=0&metro=',
|
||||
'Sec-Fetch-Dest': 'empty',
|
||||
'Sec-Fetch-Mode': 'cors',
|
||||
'Sec-Fetch-Site': 'same-origin',
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
|
||||
'account-id': '',
|
||||
'partner': '',
|
||||
'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2Fpc%2Fsearch%3Fkeyword%3Djava%26searchType%3D2%26sortType%3D0%26metro%3D%22%2C%22identityType%22%3A%22%22%2C%22userType%22%3A%22%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountid%22%3A%22%22%7D',
|
||||
'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
|
||||
'sec-ch-ua-mobile': '?0',
|
||||
'sec-ch-ua-platform': '"Windows"',
|
||||
'sign': '839932c059141791d8a003f0e6652e14facbf788a502df374fecf9c107d93b9e',
|
||||
'user-token': '',
|
||||
'uuid': '1687228791235576552',
|
||||
}
|
||||
params = {
|
||||
'api_key': '51job',
|
||||
'timestamp': '1769139097',
|
||||
'keyword': '华为技术有限公司',
|
||||
'searchType': '2',
|
||||
'function': '',
|
||||
'industry': '',
|
||||
'jobArea': '000000',
|
||||
'jobArea2': '',
|
||||
'landmark': '',
|
||||
'metro': '',
|
||||
'salary': '',
|
||||
'workYear': '',
|
||||
'degree': '',
|
||||
'companyType': '',
|
||||
'companySize': '',
|
||||
'jobType': '',
|
||||
'issueDate': '',
|
||||
'sortType': '0',
|
||||
'pageNum': '1',
|
||||
'requestId': '',
|
||||
'pageSize': '20',
|
||||
'source': '1',
|
||||
'accountId': '',
|
||||
'pageCode': 'sou|sou|soulb',
|
||||
'scene':'7'
|
||||
}
|
||||
|
||||
# 换成自己的代理,或者不用,单个ip应该有限制
|
||||
proxies = {
|
||||
"http":"http://xxx",
|
||||
"https":"http://xxxx"
|
||||
}
|
||||
|
||||
|
||||
# Manual experiment: walk through the three-step anti-bot handshake once.
# range(1, 2) yields a single iteration; widen it to repeat the request.
for i in range(1,2):
    try:
        # cookie = {'guid': 'd02dfbabd84858301947663946e1710f'}
        session = requests.session()
        print("第%s次请求:" % i)
        response = session.get('https://we.51job.com/api/job/search-pc', params=params,headers=headers,verify=False) # key point: SSL verification is disabled
        print(response.text[:300])
        # Pull the arg1 challenge out of the response; [0] raises IndexError
        # when it is absent, which the except below prints.
        arg1 = re.findall("arg1='(.*?)';",response.text,re.S)[0]
        print('arg1--->',arg1)
        guid = str(uuid.uuid4()).replace("-", "")
        cookie = {'guid': str(guid)}

        # 04.js is opened relative to the current working directory.
        with open('04.js', 'r', encoding='utf-8') as f:
            js = f.read()
        # Compute the acw_sc__v2 cookie by calling l(arg1) from 04.js.
        acw_sc__v2 = execjs.compile(js).call('l', arg1)
        print('acw_sc__v2-->',acw_sc__v2)
        cookie.update({"acw_sc__v2": acw_sc__v2})
        # cookie.update({"acw_sc__v3": "649257ebe376df87b3db6a94c1e5ad37f42f783b"})
        # Second request: send the computed cookie and merge in any cookies
        # the server returns.
        response2 = session.get('https://we.51job.com/api/job/search-pc', params=params,headers=headers,cookies=cookie,verify=False) #
        cookie.update(response2.cookies.get_dict())

        # Third request: repeat with the merged cookie set and print the body.
        response = session.get('https://we.51job.com/api/job/search-pc', params=params,headers=headers, cookies=cookie,verify=False)
        print(response.text)
        time.sleep(0.5)
    except Exception as e:
        print(e)
|
||||
1
company_spider/zhilianzhaopin_company/__init__.py
Normal file
1
company_spider/zhilianzhaopin_company/__init__.py
Normal file
@ -0,0 +1 @@
|
||||
# 智联招聘
|
||||
27247
company_spider/zhilianzhaopin_company/city.json
Normal file
27247
company_spider/zhilianzhaopin_company/city.json
Normal file
File diff suppressed because it is too large
Load Diff
68
company_spider/zhilianzhaopin_company/searcc_kw.py
Normal file
68
company_spider/zhilianzhaopin_company/searcc_kw.py
Normal file
@ -0,0 +1,68 @@
|
||||
import math
|
||||
import copy
|
||||
|
||||
# Base-32 alphabet: digits 0-9 followed by the letters A-V.
R = "0123456789ABCDEFGHIJKLMNOPQRSTUV"

# Keys rendered as path segments, in the order they appear in the path.
_PATH_KEYS = ("jl", "jt", "in", "kw", "p")


def a(e: str) -> str:
    """Map a binary-digit string (e.g. ``"10000"``) to its symbol in ``R``."""
    return R[int(e, 2)]


def n(e: str) -> str:
    """Encode *e* as base-32 text over its 16-bit UTF-16 code units.

    Each code unit contributes 16 bits (the JS ``charCodeAt`` equivalent);
    the bit string is right-padded with zeros to a multiple of five and each
    five-bit group is mapped through ``a``.

    Characters outside the Basic Multilingual Plane are encoded as their two
    surrogate code units, matching JavaScript. ``format(ord(ch), "016b")``
    would instead emit 17 or more bits for them.
    """
    # Big-endian UTF-16 yields exactly the code units JS strings expose;
    # surrogatepass keeps lone surrogates encodable.
    raw = e.encode("utf-16-be", errors="surrogatepass")
    bits = "".join(format(byte, "08b") for byte in raw)
    bits += "0" * (-len(bits) % 5)
    return "".join(a(bits[i:i + 5]) for i in range(0, len(bits), 5))


def generate_url(e: dict) -> str:
    """Build the search path (plus optional query string) from a parameter dict.

    Truthy values for ``jl``, ``jt``, ``in``, ``kw`` and ``p`` become path
    segments ``<key><value>`` joined by "/", in that order; the ``kw`` value
    is encoded with ``n``. All other truthy entries become ``key=value``
    query parameters in dict order. Falsy values are dropped.

    The input dict is not modified.
    """
    remaining = dict(e)  # work on a copy so the caller's dict is untouched
    segments = []
    for key in _PATH_KEYS:
        value = remaining.get(key)
        if value:
            rendered = n(value) if key == "kw" else value
            segments.append(f"{key}{rendered}")
            del remaining[key]

    query = [f"{key}={value}" for key, value in remaining.items() if value]

    a_path = "/".join(segments)
    if query:
        a_path += "?" + "&".join(query)
    return a_path
|
||||
|
||||
|
||||
# Demo: print the search URL built for a sample area code and keyword.
if __name__ == '__main__':
    url = f"https://www.zhaopin.com/sou/{generate_url({'jl': 530, 'kw': 'app推广经理'})}"
    print(url)
|
||||
284
company_spider/zhilianzhaopin_company/spider.py
Normal file
284
company_spider/zhilianzhaopin_company/spider.py
Normal file
@ -0,0 +1,284 @@
|
||||
from playwright.sync_api import sync_playwright, BrowserContext, Page
|
||||
import time
|
||||
import json
|
||||
import os
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
from company_spider.zhilianzhaopin_company.searcc_kw import generate_url
|
||||
|
||||
|
||||
class CityLoader:
    """Singleton that maps city names to codes loaded from a JSON file.

    The first construction loads the file; later constructions return the
    same instance and ignore their arguments.
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        """Return the shared instance, creating it on first use."""
        if cls._instance:
            return cls._instance
        cls._instance = super(CityLoader, cls).__new__(cls)
        return cls._instance

    def __init__(self, city_file="city.json"):
        """Load *city_file* (resolved next to this module) once per process."""
        # __init__ runs on every CityLoader() call; load only the first time.
        if not hasattr(self, 'city_map'):
            module_dir = os.path.dirname(os.path.abspath(__file__))
            self.file_path = os.path.join(module_dir, city_file)
            self.city_map = {}
            self._load_cities()

    def _load_cities(self):
        """Populate ``city_map`` from the file's "allCity" list.

        A missing file or any load/parse error is printed, not raised.
        """
        if os.path.exists(self.file_path):
            try:
                with open(self.file_path, 'r', encoding='utf-8') as handle:
                    payload = json.load(handle)
                    self._parse_city_data(payload.get("allCity", []))
            except Exception as e:
                print(f"Error loading city file: {e}")
        else:
            print(f"City file not found: {self.file_path}")

    def _parse_city_data(self, cities):
        """Record name -> code for every entry, descending into "sublist" entries.

        Entries are visited depth-first in document order, so a later entry
        with the same name overwrites an earlier one.
        """
        # Explicit stack; children are pushed reversed so they pop in order.
        pending = list(cities)[::-1]
        while pending:
            node = pending.pop()
            self.city_map[node['name']] = node['code']
            children = node.get('sublist')
            if children:
                pending.extend(list(children)[::-1])

    def get_code(self, city_name):
        """Return the code for *city_name*, or ``None`` when it is unknown."""
        return self.city_map.get(city_name)
|
||||
|
||||
|
||||
def get_companies_from_page(page: Page) -> List[Dict[str, str]]:
    """Collect company names and absolute links from a search results page.

    Candidate selectors are tried in order and the first one that matches at
    least one element is used. If none match, every anchor whose href
    contains "company" is used instead.

    Args:
        page: Page that has already been navigated to the search results.

    Returns:
        A list of ``{'name': ..., 'url': ...}`` dicts in element order,
        de-duplicated by company name (the first occurrence is kept).
        Elements with empty text or no href are skipped.
    """
    # Ordered from most to least specific.
    company_selectors = [
        'a[class*="company"]',
        '.company-name a',
        'a.company-name',
        '[class*="CompanyName"] a',
        'a[href*="/company/"]'
    ]

    company_elements = []
    for selector in company_selectors:
        try:
            elements = page.query_selector_all(selector)
        except Exception:
            # A failing selector is not fatal; try the next one. Exception
            # (not a bare except) lets Ctrl+C / SystemExit still propagate.
            continue
        if elements:
            company_elements = elements
            print(f"使用选择器找到 {len(elements)} 个元素: {selector}")
            break

    # Nothing matched: fall back to a broader href-based query.
    if not company_elements:
        all_links = page.query_selector_all('a[href*="company"]')
        company_elements = all_links
        print(f"通过通用方法找到 {len(all_links)} 个公司链接")

    companies = []
    seen_names = set()
    for element in company_elements:
        try:
            company_name = element.inner_text().strip()
            company_url = element.get_attribute('href')
        except Exception:
            # The element may have become unusable; skip it.
            continue

        if not company_name or not company_url:
            continue

        # Normalize to an absolute URL.
        if company_url.startswith('//'):
            # Protocol-relative link: it already names its own host, so only
            # the scheme is missing. Must be checked before the '/' case.
            company_url = f"https:{company_url}"
        elif company_url.startswith('/'):
            company_url = f"https://www.zhaopin.com{company_url}"
        elif not company_url.startswith('http'):
            company_url = f"https://www.zhaopin.com/{company_url}"

        # De-duplicate by name, keeping the first link seen.
        if company_name in seen_names:
            continue
        seen_names.add(company_name)
        companies.append({
            'name': company_name,
            'url': company_url
        })

    return companies
|
||||
|
||||
|
||||
def get_company_intro(context: BrowserContext, company_url: str) -> str:
    """Open a company page in a new tab and return its introduction text.

    Several candidate selectors are tried in order and the first non-empty
    text wins. If none yields text, the first 500 characters of the page
    body are returned instead.

    Args:
        context: Browser context used to open the temporary page.
        company_url: Absolute URL of the company page.

    Returns:
        The introduction text, "未找到公司简介" when nothing could be
        extracted, or "获取失败: <error>" when any step raised. This
        function does not raise for ordinary errors.
    """
    intro_selectors = [
        '.company-intro',
        '.company-description',
        '[class*="intro"]',
        '[class*="description"]',
        '.company-info',
        '[class*="CompanyIntro"]'
    ]

    try:
        company_page = context.new_page()
        try:
            company_page.goto(company_url, wait_until="networkidle", timeout=30000)
            # Short fixed pause after navigation before querying the DOM.
            time.sleep(2)

            company_intro = ""
            for selector in intro_selectors:
                try:
                    intro_element = company_page.query_selector(selector)
                    if intro_element:
                        company_intro = intro_element.inner_text().strip()
                except Exception:
                    # A failing selector is not fatal; try the next one.
                    continue
                if company_intro:
                    break

            # Last resort: take the beginning of the page's visible text.
            if not company_intro:
                try:
                    body = company_page.query_selector('body')
                    if body:
                        company_intro = body.inner_text()[:500]
                except Exception:
                    pass
        finally:
            # Always release the tab, including when navigation times out;
            # otherwise every failed lookup leaves a page open in the context.
            company_page.close()

        return company_intro if company_intro else "未找到公司简介"

    except Exception as e:
        return f"获取失败: {str(e)}"
|
||||
|
||||
|
||||
def _build_launch_kwargs(headless: bool, proxy: Optional[str]) -> Dict:
    """Assemble the keyword arguments passed to ``chromium.launch``."""
    browser_kwargs = {
        "headless": headless,
        "args": ["--disable-blink-features=AutomationControlled"]
    }

    # Use a locally installed Chrome when present at the default Windows
    # location; otherwise the bundled browser is launched.
    chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
    if os.path.exists(chrome_path):
        browser_kwargs["executable_path"] = chrome_path

    if proxy:
        browser_kwargs["proxy"] = {"server": proxy}
        print(f"使用代理: {proxy}")

    return browser_kwargs


def _collect_companies(context: BrowserContext, params: Dict, target_company: str, max_companies: int) -> List[Dict]:
    """Run the search in ``context`` and return the matching company records."""
    page = context.new_page()

    url = f"https://www.zhaopin.com/sou/{generate_url(params)}"
    print(f"访问URL: {url}")
    page.goto(url, wait_until="networkidle", timeout=30000)
    time.sleep(3)

    companies = get_companies_from_page(page)
    print(f"\n找到 {len(companies)} 家公司")

    if target_company:
        print(f"搜索目标公司: {target_company}")

        # An exact name match anywhere in the list beats any partial match.
        exact = next(
            (c for c in companies if c['name'].strip() == target_company),
            None,
        )
        matched = exact
        if matched is None:
            matched = next(
                (
                    c for c in companies
                    if target_company in c['name'].strip()
                    or c['name'].strip() in target_company
                ),
                None,
            )

        if matched is not None:
            company_name = matched['name'].strip()
            if exact is not None:
                print(f"找到完全匹配的公司: {company_name}")
            else:
                print(f"找到部分匹配的公司: {company_name}")
            print("正在获取公司简介...")
            return [{
                'name': company_name,
                'url': matched['url'],
                'intro': get_company_intro(context, matched['url'])
            }]

    # No keyword or no match: fetch intros for the first max_companies hits.
    print(f"未找到完全匹配的公司,获取前 {max_companies} 家公司信息")
    results = []
    selected = companies[:max_companies]
    for i, company in enumerate(selected, 1):
        print(f"\n[{i}/{min(max_companies, len(companies))}] 正在获取: {company['name']}")
        results.append({
            'name': company['name'],
            'url': company['url'],
            'intro': get_company_intro(context, company['url'])
        })
        # Brief pause between detail-page requests.
        time.sleep(1)

    return results


def crawl_companies(params: Dict, max_companies: int = 10, headless: bool = False, proxy: Optional[str] = None) -> List[Dict]:
    """Crawl company information from Zhaopin search results.

    Args:
        params: Search parameters, e.g. ``{'jl': 530, 'kw': '...'}`` or
            ``{'city': '北京', 'kw': '...'}``. When ``city`` is given without
            ``jl``, the city name is mapped to a code via ``CityLoader``.
            The caller's dict is not modified.
        max_companies: Maximum number of companies to fetch when no company
            name matches the keyword. Defaults to 10.
        headless: Whether to run the browser headless. Defaults to False.
        proxy: Proxy address, e.g. ``"http://user:pass@host:port"``.

    Returns:
        A list of dicts with ``name``, ``url`` and ``intro``. If a company
        whose name matches the keyword is found (exact match first, then
        substring match in either direction), only that company is returned
        and the list has length 1.

    Raises:
        Errors from launching the browser or loading the search page
        propagate; the browser and context are closed first.
    """
    # Work on a copy so filling in 'jl' does not leak into the caller's dict.
    params = dict(params)

    if 'city' in params and 'jl' not in params:
        code = CityLoader().get_code(params['city'])
        if code:
            print(f"城市 '{params['city']}' 映射代码为: {code}")
            params['jl'] = code
        else:
            print(f"未找到城市 '{params['city']}' 的代码")

    # Tolerate an explicit kw=None as well as a missing key.
    target_company = (params.get('kw') or '').strip()

    with sync_playwright() as p:
        browser = p.chromium.launch(**_build_launch_kwargs(headless, proxy))
        try:
            context = browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            try:
                return _collect_companies(context, params, target_company, max_companies)
            finally:
                context.close()
        finally:
            # Runs on every exit path, including navigation timeouts.
            browser.close()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual test run: crawl a sample query and print a readable report.
    search_params = {'city': '北京', 'kw': 'app推广经理'}
    crawled = crawl_companies(search_params, max_companies=10)

    banner = "=" * 80
    print("\n" + banner)
    print("爬取结果:")
    print(banner)

    for item in crawled:
        print(f"\n公司名称: {item['name']}")
        print(f"公司链接: {item['url']}")
        intro = item['intro']
        # Long introductions are cut to 200 characters for display.
        if len(intro) > 200:
            print(f"公司简介: {intro[:200]}...")
        else:
            print(f"公司简介: {intro}")
        print("-" * 80)
|
||||
6
jobs_spider/qcwy/crawl_stats.log
Normal file
6
jobs_spider/qcwy/crawl_stats.log
Normal file
@ -0,0 +1,6 @@
|
||||
{"timestamp": "2026-01-15 00:38:02", "total_crawled": 517, "unique_count": 503, "duplicate_count": 14, "api_total_count": 505, "job_area": "商丘", "function_type": "8305"}
|
||||
{"timestamp": "2026-01-15 01:36:23", "total_crawled": 517, "unique_count": 509, "duplicate_count": 8, "api_total_count": 517, "job_area": "广安", "function_type": "1318"}
|
||||
{"timestamp": "2026-01-15 02:32:36", "total_crawled": 517, "unique_count": 511, "duplicate_count": 6, "api_total_count": 517, "job_area": "阜阳", "function_type": "6101"}
|
||||
{"timestamp": "2026-01-15 03:32:52", "total_crawled": 517, "unique_count": 513, "duplicate_count": 4, "api_total_count": 517, "job_area": "常德", "function_type": "3812"}
|
||||
{"timestamp": "2026-01-15 04:31:42", "total_crawled": 517, "unique_count": 510, "duplicate_count": 7, "api_total_count": 517, "job_area": "惠州", "function_type": "3335"}
|
||||
{"timestamp": "2026-01-15 05:28:54", "total_crawled": 517, "unique_count": 515, "duplicate_count": 2, "api_total_count": 517, "job_area": "锦州", "function_type": "0154"}
|
||||
8840
jobs_spider/qcwy/logs/log_2026-01-15.log
Normal file
8840
jobs_spider/qcwy/logs/log_2026-01-15.log
Normal file
File diff suppressed because it is too large
Load Diff
@ -25,7 +25,7 @@ os.makedirs("logs", exist_ok=True)
|
||||
logger.add("logs/log_{time:YYYY-MM-DD}.log", level="INFO", rotation="00:00", retention="30 days", enqueue=True)
|
||||
|
||||
|
||||
API_BASE_URL = os.getenv('API_BASE_URL', 'http://127.0.0.1:9999')
|
||||
API_BASE_URL = os.getenv('API_BASE_URL', 'http://124.222.106.226:9999')
|
||||
|
||||
API_PUBLIC_HOST = os.getenv('API_PUBLIC_HOST')
|
||||
PROXY_URL = "http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user