up

2026-01-24 17:07:34 +08:00 · 2026-01-24 17:07:34 +08:00 · 3d7e96845d
commit 3d7e96845d
parent 7285475eb5
21 changed files with 43687 additions and 1 deletions
--- a/2
+++ b/2
@ -63,6 +63,8 @@ uvicorn = "==0.34.0"
 uvloop = "==0.21.0"
 watchfiles = "==1.0.4"
 websockets = "==14.1"
+PyExecJS = "==1.5.1"
+playwright = "==1.57.0"
 asyncpg = "*"
 pandas = "*"
 openpyxl = "*"
--- a/app/api/v1/init.py
+++ b/app/api/v1/init.py
@ -17,6 +17,7 @@ from .pipeline import pipeline_router
 from .keyword import keyword_router
 from .cleaning import cleaning_router
 from .analytics import router as analytics_router
+from .company import company_router


 v1_router = APIRouter()
@ -37,3 +38,4 @@ v1_router.include_router(pipeline_router, prefix="/pipeline")
 v1_router.include_router(keyword_router, prefix="/keyword")
 v1_router.include_router(cleaning_router, prefix="/cleaning", dependencies=[DependPermission])
 v1_router.include_router(analytics_router, prefix="/analytics", tags=["数据分析"])
+v1_router.include_router(company_router, prefix="/company", tags=["公司搜索"])
--- a/app/api/v1/company/init.py
+++ b/app/api/v1/company/init.py
@ -0,0 +1,3 @@
+from .company import router as company_router
+
+__all__ = ["company_router"]
--- a/app/api/v1/company/company.py
+++ b/app/api/v1/company/company.py
@ -0,0 +1,47 @@
+from typing import Any, Dict, Optional
+
+from fastapi import APIRouter, Depends
+from pydantic import BaseModel, Field
+
+from app.controllers.company import CompanyController, create_company_controller
+from app.schemas.base import Fail, Success
+
+
+router = APIRouter(tags=["公司搜索"])
+
+
+class ZhilianSearchRequest(BaseModel):
+    """Request body for the Zhilian company search endpoint."""
+
+    # Company-name keyword to search for (required).
+    keyword: str = Field(..., description="公司名称关键词")
+    # Optional city name; the field description gives "北京" as an example.
+    city: Optional[str] = Field(None, description="城市名称，如北京")
+
+
+class QcwySearchRequest(BaseModel):
+    """Request body for the 51job (qcwy) company search endpoint."""
+
+    # Company-name keyword to search for (required).
+    keyword: str = Field(..., description="公司名称关键词")
+
+
+async def get_company_controller() -> CompanyController:
+    """FastAPI dependency: build a CompanyController through the factory."""
+    return create_company_controller()
+
+
+# POST /zhilian/search: look up companies on Zhilian by keyword and optional city.
+# Any exception raised by the controller is turned into a Fail response whose
+# message embeds the exception text, so the handler itself never raises.
+# NOTE(review): the annotation says Dict[str, Any] but the body returns
+# Success/Fail objects from app.schemas.base -- confirm this is intended.
+# (Plain comments are used instead of a docstring so the generated OpenAPI
+# description stays unchanged.)
+@router.post("/zhilian/search", summary="智联招聘公司搜索")
+async def zhilian_search_company(
+    request: ZhilianSearchRequest,
+    controller: CompanyController = Depends(get_company_controller),
+) -> Dict[str, Any]:
+    try:
+        data = await controller.search_zhilian_company(request.keyword, request.city)
+        return Success(data=data)
+    except Exception as exc:
+        # The raw exception text is returned to the client.
+        return Fail(msg=f"智联公司搜索失败: {exc}")
+
+
+# POST /qcwy/search: look up a company on 51job (qcwy) by keyword.
+# Any exception raised by the controller is turned into a Fail response whose
+# message embeds the exception text, so the handler itself never raises.
+# NOTE(review): the annotation says Dict[str, Any] but the body returns
+# Success/Fail objects from app.schemas.base -- confirm this is intended.
+# (Plain comments are used instead of a docstring so the generated OpenAPI
+# description stays unchanged.)
+@router.post("/qcwy/search", summary="前程无忧公司搜索")
+async def qcwy_search_company(
+    request: QcwySearchRequest,
+    controller: CompanyController = Depends(get_company_controller),
+) -> Dict[str, Any]:
+    try:
+        data = await controller.search_qcwy_company(request.keyword)
+        return Success(data=data)
+    except Exception as exc:
+        # The raw exception text is returned to the client.
+        return Fail(msg=f"前程无忧公司搜索失败: {exc}")
--- a/app/controllers/company.py
+++ b/app/controllers/company.py
@ -0,0 +1,20 @@
+import asyncio
+from typing import Any, Dict, List, Optional
+
+from company_spider.qcwy_company.spider import search_company as qcwy_search_company
+from company_spider.zhilianzhaopin_company.spider import crawl_companies
+
+
+class CompanyController:
+    """Async wrapper that runs the company-search spiders in worker threads."""
+
+    async def search_qcwy_company(self, keyword: str) -> Optional[Dict[str, Any]]:
+        """Search 51job for ``keyword`` and return the spider's result."""
+        # The spider function is a plain synchronous call; asyncio.to_thread
+        # runs it off the event loop.
+        return await asyncio.to_thread(qcwy_search_company, keyword)
+
+    async def search_zhilian_company(self, keyword: str, city: Optional[str] = None) -> List[Dict[str, Any]]:
+        """Search Zhilian for ``keyword``, optionally restricted to ``city``."""
+        params = {"kw": keyword}
+        # "city" is only sent when a non-empty value was supplied.
+        if city:
+            params["city"] = city
+        # NOTE(review): the meaning of the second positional argument (10) is
+        # defined by crawl_companies, which is not visible here -- confirm.
+        return await asyncio.to_thread(crawl_companies, params, 10)
+
+
+def create_company_controller() -> CompanyController:
+    """Factory returning a new CompanyController instance."""
+    return CompanyController()
--- a/company_spider/Dockerfile
+++ b/company_spider/Dockerfile
@ -0,0 +1,46 @@
+# Image for the company spider service: Python 3.11 plus Node.js (for PyExecJS)
+# and a Playwright-managed Chromium.
+FROM python:3.11-slim
+
+# Point apt-get at the Aliyun mirror (tries both sources file layouts; never fails the build)
+RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources || \
+    sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list || true
+
+# Install system dependencies
+# Node.js is required for PyExecJS
+RUN apt-get update && apt-get install -y \
+    nodejs \
+    npm \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+# Point npm at the npmmirror registry
+RUN npm config set registry https://registry.npmmirror.com
+
+WORKDIR /app
+
+# Point pip at the Tsinghua mirror by writing a pip.conf
+RUN mkdir -p /root/.pip && \
+    echo '[global]' > /root/.pip/pip.conf && \
+    echo 'index-url = https://pypi.tuna.tsinghua.edu.cn/simple' >> /root/.pip/pip.conf && \
+    echo 'trusted-host = pypi.tuna.tsinghua.edu.cn' >> /root/.pip/pip.conf && \
+    echo 'timeout = 120' >> /root/.pip/pip.conf
+
+# Copy requirements first to leverage cache
+# NOTE(review): requirements.txt must exist in the build context -- confirm.
+COPY requirements.txt .
+
+# Install Python dependencies through the mirror configured above
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Install Playwright browsers and system dependencies
+# We only need chromium for this project
+# Download Playwright browsers from the npmmirror mirror
+ENV PLAYWRIGHT_DOWNLOAD_HOST=https://npmmirror.com/mirrors/playwright
+RUN playwright install chromium
+RUN playwright install-deps chromium
+
+COPY . .
+
+# Expose the port
+EXPOSE 8000
+
+# Run the application
+# NOTE(review): assumes a main.py exposing `app` in the build context -- confirm.
+CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
--- a/company_spider/init.py
+++ b/company_spider/init.py
--- a/company_spider/demo
+++ b/company_spider/demo
@ -0,0 +1,12 @@
+# Example: 51job (qcwy) company search; the body carries only the keyword.
+# NOTE(review): these examples target port 9999 -- confirm it matches the
+# port the API is actually served on.
+curl --location 'http://127.0.0.1:9999/api/v1/company/qcwy/search' \
+--header 'Content-Type: application/json' \
+--data '{
+  "keyword": "中信期货有限公司"
+}'
+
+# Example: Zhilian company search; the body carries a keyword and a city name.
+curl --location 'http://127.0.0.1:9999/api/v1/company/zhilian/search' \
+--header 'Content-Type: application/json' \
+--data '{
+  "keyword": "中信期货有限公司",
+  "city":"北京"
+}'
--- a/company_spider/qcwy_company/04.js
+++ b/company_spider/qcwy_company/04.js
@ -0,0 +1,65 @@
+// Stub for the browser global so the code below can run outside a browser.
+window = {};
+
+// Placeholders declared but never used in this file.
+var arg3 = null;
+var arg4 = null;
+var arg5 = null;
+var arg6 = null;
+var arg7 = null;
+var arg8 = null;
+var arg9 = null;
+var arg10 = null;
+
+// l(arg1): reorder the characters of arg1 with a fixed table (unsbox), then
+// XOR the result, read as hex, with a fixed hex key (hexXor), and return it.
+// The Python spider uses the return value as the acw_sc__v2 cookie.
+var l = function (arg1) {
+    // Spins forever if PhantomJS markers exist on window; with the empty stub
+    // above the condition is false and the loop body never runs.
+    while (window["_phantom"] || window["__phantomas"]) {
+    }
+
+    // Fixed hex key for the XOR step.
+    var _0x5e8b26 = "3000176000856006061501533003690027800375";
+
+    // hexXor: XOR two hex strings one byte (two hex digits) at a time, up to
+    // the shorter length; each result byte is zero-padded to two digits.
+    // Note: this patches String.prototype.
+    String["prototype"]["hexXor"] = function (_0x4e08d8) {
+        var _0x5a5d3b = "";
+
+        for (var _0xe89588 = 0; _0xe89588 < this["length"] && _0xe89588 < _0x4e08d8["length"]; _0xe89588 += 2) {
+            var _0x401af1 = parseInt(this["slice"](_0xe89588, _0xe89588 + 2), 16);
+
+            var _0x105f59 = parseInt(_0x4e08d8["slice"](_0xe89588, _0xe89588 + 2), 16);
+
+            var _0x189e2c = (_0x401af1 ^ _0x105f59)["toString"](16);
+
+            if (_0x189e2c["length"] == 1) {
+                _0x189e2c = "0" + _0x189e2c;
+            }
+
+            _0x5a5d3b += _0x189e2c;
+        }
+
+        return _0x5a5d3b;
+    };
+
+    // unsbox: permute the characters. The character at 1-based position p is
+    // written to output index j where table[j] == p; the pieces are then joined.
+    String["prototype"]["unsbox"] = function () {
+        var _0x4b082b = [15, 35, 29, 24, 33, 16, 1, 38, 10, 9, 19, 31, 40, 27, 22, 23, 25, 13, 6, 11, 39, 18, 20, 8, 14, 21, 32, 26, 2, 30, 7, 4, 17, 5, 3, 28, 34, 37, 12, 36];
+        var _0x4da0dc = [];
+        var _0x12605e = "";
+
+        for (var _0x20a7bf = 0; _0x20a7bf < this["length"]; _0x20a7bf++) {
+            var _0x385ee3 = this[_0x20a7bf];
+
+            for (var _0x217721 = 0; _0x217721 < _0x4b082b["length"]; _0x217721++) {
+                if (_0x4b082b[_0x217721] == _0x20a7bf + 1) {
+                    _0x4da0dc[_0x217721] = _0x385ee3;
+                }
+            }
+        }
+
+        _0x12605e = _0x4da0dc["join"]("");
+        return _0x12605e;
+    };
+
+    var _0x23a392 = arg1["unsbox"]();
+
+    // arg2 is assigned without a declaration, so it becomes an implicit global.
+    arg2 = _0x23a392["hexXor"](_0x5e8b26);
+    // Debug output of the computed value.
+    console.log('arg2==>', arg2)
+    // setTimeout("reload(arg2)", 2);
+    return arg2
+};
+// Manual test invocation, left disabled:
+// var arg1 = "FAA6CB46CF724D58FF82E5310687947623413114";
+// l(arg1)
--- a/company_spider/qcwy_company/init.py
+++ b/company_spider/qcwy_company/init.py
--- a/company_spider/qcwy_company/out.json
+++ b/company_spider/qcwy_company/out.json
--- a/company_spider/qcwy_company/qcwy.js
+++ b/company_spider/qcwy_company/qcwy.js
@ -0,0 +1,91 @@
+import crypto from 'crypto';
+
+// Hex-encoded HMAC-SHA256 of `message` under `key`, using Node's crypto module.
+const hmacSHA256 = (message, key) => {
+    return crypto
+        .createHmac('sha256', key)
+        .update(message)
+        .digest('hex');
+};
+
+// A(t): sign a request config. The signed message is t.url followed by t.data
+// (or "" when t.data is falsy), HMAC-SHA256'd with a fixed key.
+function A(t) {
+    // Debug output of the inputs being signed.
+    console.log(t.url)
+    console.log(t.data)
+    var e = t.url + (t.data || "")
+    return hmacSHA256(e, "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b")
+}
+
+
+// a(e, ...sources): copies properties from each extra argument onto e and
+// returns e.
+// NOTE(review): `r` and `i` are not defined anywhere in this file and nothing
+// here calls a(); it looks like leftover bundler output -- confirm before use.
+function a(e) {
+    for (var t = 1; t < arguments.length; t++) {
+        // Treat null/undefined sources as empty objects.
+        var n = null != arguments[t] ? arguments[t] : {};
+        t % 2 ? r(Object(n), !0).forEach((function (t) {
+                Object(i["a"])(e, t, n[t])
+            }
+        )) : Object.getOwnPropertyDescriptors ? Object.defineProperties(e, Object.getOwnPropertyDescriptors(n)) : r(Object(n)).forEach((function (t) {
+                Object.defineProperty(e, t, Object.getOwnPropertyDescriptor(n, t))
+            }
+        ))
+    }
+    return e
+}
+
+// Sample request config used to exercise A() below. Only `url` (and `data`,
+// absent here) feed the signature; note that the query string is fully
+// percent-encoded, including pageCode ("sou%7Csou%7Csoulb").
+var t = {
+    "transitional": {"silentJSONParsing": true, "forcedJSONParsing": true, "clarifyTimeoutError": false},
+    "transformRequest": [null],
+    "transformResponse": [null],
+    "timeout": 30000,
+    "xsrfCookieName": "XSRF-TOKEN",
+    "xsrfHeaderName": "X-XSRF-TOKEN",
+    "maxContentLength": -1,
+    "maxBodyLength": -1,
+    "headers": {
+        "common": {"Accept": "application/json, text/plain, */*"},
+        "delete": {},
+        "get": {"Content-Type": "application/x-www-form-urlencoded"},
+        "head": {},
+        "post": {"Content-Type": "application/json"},
+        "put": {"Content-Type": "application/x-www-form-urlencoded"},
+        "patch": {"Content-Type": "application/x-www-form-urlencoded"}
+    },
+    "baseURL": "https://we.51job.com",
+    "withCredentials": true,
+    "url": "/api/job/search-pc?api_key=51job&timestamp=1769136341&keyword=%E5%8D%8E%E4%B8%BA%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&searchType=2&function=&industry=&jobArea=010000&jobArea2=&landmark=&metro=&salary=&workYear=&degree=&companyType=&companySize=&jobType=&issueDate=&sortType=0&pageNum=1&requestId=&keywordType=&pageSize=20&source=1&accountId=&pageCode=sou%7Csou%7Csoulb&scene=7",
+    "method": "get",
+    "property": {"keywordType": ""}
+};
+
+// Sample "property" payload; not referenced by any code in this file.
+var b = {
+    "partner": "cn_bing_com",
+    "webId": 2,
+    "fromdomain": "51job_web",
+    "frompageUrl": "https://we.51job.com/",
+    "pageUrl": "https://we.51job.com/pc/search?jobArea=010000&keyword=%E5%8D%8E%E4%B8%BA%E6%8A%80%E6%9C%AF%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8&searchType=2&keywordType=",
+    "identityType": "",
+    "userType": "",
+    "isLogin": "否",
+    "accountid": ""
+}
+// Print the signature computed for the sample config.
+console.log(A(t));
+
+// function wordsToHex(words) {
+//     // CryptoJS 使用 32 位有符号整数存储，需要处理
+//     let hex = '';
+//     for (let i = 0; i < words.length; i++) {
+//         // 将负数转换为无符号整数
+//         const word = words[i] >>> 0;
+//         // 转换为十六进制并补零
+//         hex += word.toString(16).padStart(8, '0');
+//     }
+//     return hex;
+// }
+//
+// // 你的数据
+// const words = [-762966511, 1702028048, 94455509, -201850815,
+//     300412866, 1405396681, 85275542, 246713406];
+//
+// console.log(wordsToHex(words))
+
+/*
+* sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2219be8d71f8213f1-0fd9b910813aa58-4c657b58-3686400-19be8d71f831716%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%7D%2C%22identities%22%3A%22eyIkaWRlbnRpdHlfY29va2llX2lkIjoiMTliZThkNzFmODIxM2YxLTBmZDliOTEwODEzYWE1OC00YzY1N2I1OC0zNjg2NDAwLTE5YmU4ZDcxZjgzMTcxNiJ9%22%2C%22history_login_id%22%3A%7B%22name%22%3A%22%22%2C%22value%22%3A%22%22%7D%2C%22%24device_id%22%3A%2219be8d71f8213f1-0fd9b910813aa58-4c657b58-3686400-19be8d71f831716%22%7D; ssxmod_itna=1-Gqmx0DuD2Dc0D=Ni73itD2Dp=DmpsKeQDzxCH9P0CCDLxn4xGdY2=Cw3DyD7Tp4RG5DaQi2Yea0xGXKwDA5DnCx7YDt=RcwxK06dvxK=W0mitswe6uDuYLP2GGRRgW_GlDMEHLA6C5N7qxDHwd4KxGLDY=DCqxq57eD4f3Dt4DIDAYDDxDWDYEPxGUQDG=D7rTi5pWtxi3DboaDmd2WC=FD03q=EWFoDDtAbeG2bETqDDNqF9G3_lh3_PD_bW9QKtWemFxPneDMbxGX7YCqnlH2oyDWpFkUsao3xB=gxBQbyPnhwETadZanDY4lGrWYY2DIjGxWxiGG1i05Q03nwsWmwlG1Gv_GDxhw4SrUDDAt_hWDHBRqW0tK2lj5/bc_9yYtAbYW=LgrRxxWqqRAOIotBhNi47fD5=4qf0esGthu5oiPeD; ssxmod_itna2=1-Gqmx0DuD2Dc0D=Ni73itD2Dp=DmpsKeQDzxCH9P0CCDLxn4xGdY2=Cw3DyD7Tp4RG5DaQi2YeaKxDfrQfQGh4qBFjq03_jSefWDlO03BqKSSfAaeFuhD2y0F5nKj4LMzWF2qLViLAjiLzGteYAj1KAULY4hzS3=uiLiHAktq7AQK04=RCrN4_lNnDaNPYDr4nhTEjfu/3d5Fcwil7pUxfDu7yjj5TT0UnkUbM4F0FALQk19oO64i1g2QsibdzqxtPn8oOB3wpj5FVm6R_LF2EKxZIWFfaGt9oNT4U_0IjQx40hUsUKLNOBzuR1Mh=_gTlLdLS53B3OE4dGDB8GdjhOf4MYhuE37oTUMtTCwOOD7WhhjwgohMumFghOCNeDxRqr92NTeIRW=oOeThvw7DBG5/DoFShd7v5ZxwYEKiDD
+* */
--- a/company_spider/qcwy_company/spider.py
+++ b/company_spider/qcwy_company/spider.py
@ -0,0 +1,327 @@
+import hashlib
+import hmac
+import json
+import execjs
+import re
+import time
+import uuid
+import requests
+from urllib.parse import unquote, quote
+from typing import Optional, Dict
+import os
+
+
+class SignGenerator:
+    """Build the request signatures and anti-bot cookie used for 51job's web API.
+
+    Every signature is a hex HMAC-SHA256 digest keyed with ``secret_key``.
+    """
+
+    # NOTE(security): the signing key is embedded in source (per the original
+    # comment it was taken from the site's JS). Consider loading it from
+    # configuration instead of committing it.
+    _SECRET_KEY = "abfc8f9dcf8c3f3d8aa294ac5f2cf2cc7767e5592590f39c3f503271dd68562b"
+
+    def __init__(self):
+        # Signing key, kept as instance attributes for existing callers.
+        self.secret_key = self._SECRET_KEY
+        self.secret_key_bytes = self.secret_key.encode('utf-8')
+
+    def hmac_sha256(self, message, key):
+        """Return the hex HMAC-SHA256 of ``message`` under ``key``.
+
+        Both arguments may be ``str`` (encoded as UTF-8) or ``bytes``.
+        """
+        key_bytes = key.encode('utf-8') if isinstance(key, str) else key
+        message_bytes = message.encode('utf-8') if isinstance(message, str) else message
+        return hmac.new(key_bytes, message_bytes, hashlib.sha256).hexdigest()
+
+    def generate_signature(self, t):
+        """Sign a request config dict (counterpart of function ``A`` in the JS).
+
+        JS logic: ``signature = hmacSHA256(url + (data || ""), secret_key)``.
+
+        Args:
+            t: Mapping with an optional ``"url"`` and an optional ``"data"``
+                (``str``, ``dict`` or ``None``).
+
+        Returns:
+            Hex digest string.
+        """
+        # A present-but-None "data" used to raise TypeError on concatenation;
+        # delegating treats it as "" like the JS `data || ""`.
+        return self.generate_signature_from_components(t.get("url", ""), t.get("data"))
+
+    def generate_signature_from_components(self, url, data=None):
+        """Sign ``url`` followed by ``data``.
+
+        Args:
+            url: Path plus query string, exactly as it will be requested.
+            data: ``None`` (treated as ""), a ``dict`` (serialised as compact
+                JSON without ASCII escaping) or an already-serialised string.
+
+        Returns:
+            Hex digest string.
+        """
+        if data is None:
+            data = ""
+        elif isinstance(data, dict):
+            data = json.dumps(data, ensure_ascii=False, separators=(',', ':'))
+
+        return self.hmac_sha256(url + data, self.secret_key)
+
+    def generate_acw_sc__v2(self, arg1):
+        """Compute the ``acw_sc__v2`` cookie value for challenge string ``arg1``.
+
+        Runs function ``l`` from ``04.js`` (located next to this module)
+        through PyExecJS.
+
+        Returns:
+            The computed value, or ``None`` if the script returned a falsy value.
+        """
+        current_dir = os.path.dirname(os.path.abspath(__file__))
+        js_file_path = os.path.join(current_dir, '04.js')
+        with open(js_file_path, 'r', encoding='utf-8') as f:
+            js = f.read()
+        return execjs.compile(js).call('l', arg1) or None
+
+    def generate_company_detail(self, cid: str) -> dict:
+        """Sign a company-detail request for encrypted company id ``cid``.
+
+        Returns:
+            ``{"signature": <hex digest>, "timestamp": <int seconds>}``; the
+            same timestamp must be used in the request URL.
+        """
+        timestamp = int(time.time())
+        # String to sign: the request path and query exactly as requested.
+        message = f"/open/noauth/company-info/pc-info?api_key=51job&timestamp={timestamp}&encryCompanyId={cid}"
+        # Reuse the shared key instead of a second hard-coded copy.
+        signature = self.hmac_sha256(message, self.secret_key)
+        return {"signature": signature, "timestamp": timestamp}
+
+
+_QCWY_BASE_URL = 'https://we.51job.com'
+_QCWY_SEARCH_PATH = '/api/job/search-pc'
+
+
+def _build_search_path(params: Dict) -> str:
+    """Return the search path with a fully percent-encoded query string.
+
+    Every value is encoded (empty values become ``key=``) so the string that
+    is signed is byte-for-byte the string that is requested. The sample URL in
+    qcwy.js shows the site signing the encoded form (e.g. ``sou%7Csou%7Csoulb``).
+    """
+    query_string = '&'.join(
+        f"{key}={quote(str(value), safe='')}" for key, value in params.items()
+    )
+    return f"{_QCWY_SEARCH_PATH}?{query_string}"
+
+
+def _find_exact_company(items, keyword: str) -> Optional[Dict]:
+    """Return the first item whose full company name equals ``keyword``."""
+    wanted = keyword.strip()
+    for item in items:
+        # `or ''` guards against an explicit null in the payload.
+        full_company_name = (item.get('fullCompanyName') or '').strip()
+        if full_company_name == wanted:
+            return {
+                'fullCompanyName': full_company_name,
+                'companyName': (item.get('companyName') or '').strip(),
+                'companyHref': (item.get('companyHref') or '').strip()
+            }
+    return None
+
+
+def search_company(keyword: str, job_area: str = "000000") -> Optional[Dict]:
+    """Search 51job for a company whose full name exactly matches ``keyword``.
+
+    Args:
+        keyword: Company name to search for.
+        job_area: Job-area code; the default "000000" means nationwide.
+
+    Returns:
+        A dict with ``fullCompanyName``, ``companyName`` and ``companyHref``
+        for the first exact match on the first result page, or ``None`` when
+        nothing matches or the request/parse fails (errors are printed).
+    """
+    signer = SignGenerator()
+
+    params = {
+        'api_key': '51job',
+        'timestamp': str(int(time.time())),
+        'keyword': keyword,
+        'searchType': '2',  # 2 = company search
+        'function': '',
+        'industry': '',
+        'jobArea': job_area,
+        'jobArea2': '',
+        'landmark': '',
+        'metro': '',
+        'salary': '',
+        'workYear': '',
+        'degree': '',
+        'companyType': '',
+        'companySize': '',
+        'jobType': '',
+        'issueDate': '',
+        'sortType': '0',
+        'pageNum': '1',
+        'requestId': '',
+        'pageSize': '20',
+        'source': '1',
+        'accountId': '',
+        'pageCode': 'sou|sou|soulb',
+        'scene': '7'
+    }
+
+    # Sign the exact path+query that is sent. Previously the signature was
+    # computed over a hand-built string while requests re-encoded the params,
+    # so the two could differ ("|" in pageCode, "/" or spaces in the keyword).
+    full_url = _build_search_path(params)
+    sign = signer.generate_signature_from_components(full_url)
+    request_url = f"{_QCWY_BASE_URL}{full_url}"
+
+    property_header = (
+        '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2Fpc%2Fsearch%3Fkeyword%3D'
+        + quote(keyword)
+        + '%26searchType%3D2%26sortType%3D0%26metro%3D%22%2C%22identityType%22%3A%22%22%2C%22userType%22%3A%22%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountid%22%3A%22%22%7D'
+    )
+    headers = {
+        'Accept': 'application/json, text/plain, */*',
+        'Accept-Language': 'zh',
+        'Cache-Control': 'no-cache',
+        'Connection': 'keep-alive',
+        'From-Domain': '51job_web',
+        'Pragma': 'no-cache',
+        'Referer': f'https://we.51job.com/pc/search?keyword={quote(keyword)}&searchType=2&sortType=0&metro=',
+        'Sec-Fetch-Dest': 'empty',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Site': 'same-origin',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
+        'account-id': '',
+        'partner': '',
+        'property': property_header,
+        'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
+        'sec-ch-ua-mobile': '?0',
+        'sec-ch-ua-platform': '"Windows"',
+        'sign': sign,
+        'user-token': '',
+        'uuid': uuid.uuid4().hex,
+    }
+
+    # NOTE(security): verify=False disables TLS certificate checks. It is kept
+    # to preserve existing behaviour; enable verification if the target's
+    # certificate chain allows it.
+    request_kwargs = {'headers': headers, 'verify': False, 'timeout': 30}
+
+    try:
+        # The context manager closes pooled connections (previously leaked).
+        with requests.Session() as session:
+            # The first response may be an anti-bot page carrying `arg1`.
+            response = session.get(request_url, **request_kwargs)
+
+            if 'arg1' in response.text:
+                arg1_match = re.search(r"arg1='(.*?)';", response.text, re.S)
+                acw_sc__v2 = (
+                    signer.generate_acw_sc__v2(arg1_match.group(1))
+                    if arg1_match else None
+                )
+                if acw_sc__v2:
+                    cookies = {'guid': uuid.uuid4().hex, 'acw_sc__v2': acw_sc__v2}
+                    # Second request: present the solved challenge and
+                    # collect any cookies the server sets in return.
+                    challenge_response = session.get(
+                        request_url, cookies=cookies, **request_kwargs
+                    )
+                    cookies.update(challenge_response.cookies.get_dict())
+                    # Third request: the real query with the full cookie set.
+                    response = session.get(
+                        request_url, cookies=cookies, **request_kwargs
+                    )
+
+            if response.status_code != 200:
+                return None
+
+            try:
+                data = response.json()
+            except ValueError:
+                # ValueError covers every JSON decode error Response.json()
+                # can raise, whichever JSON backend requests is using.
+                print(f"[错误] 响应不是有效的JSON: {response.text[:200]}")
+                return None
+
+            if data.get('status') != '1':
+                return None
+            job_section = (data.get('resultbody') or {}).get('job') or {}
+            return _find_exact_company(job_section.get('items') or [], keyword)
+
+    except Exception as e:
+        # Best-effort API: report the failure and signal "not found".
+        print(f"[错误] 请求失败: {e}")
+        import traceback
+        print(traceback.format_exc())
+        return None
+
+
+def parse_json_company_desc(uri: str) -> dict:
+    """Fetch a company's description and address from 51job's company-info API.
+
+    Despite the name, no HTML is parsed: the encrypted company id is taken
+    from the page URL and the JSON endpoint on cupid.51job.com is queried.
+
+    Args:
+        uri: Company page URL, e.g.
+            "https://jobs.51job.com/all/coUT9QPQdhBzEGY1A1VjQ.html".
+
+    Returns:
+        On success, a dict with ``company_desc``, ``company_location`` and
+        ``encryCompanyId``. On failure, a dict with only ``company_desc`` and
+        ``company_location``, both set to a failure marker string.
+    """
+    # Last path segment, without any query string or fragment.
+    slug = uri.split("/")[-1].split("?", 1)[0].split("#", 1)[0]
+    if slug.endswith(".html"):
+        slug = slug[:-len(".html")]
+    # Strip only the leading "co". The previous str.replace removed every
+    # "co" inside the id, corrupting ids that contain that sequence.
+    cid = slug[2:] if slug.startswith("co") else slug
+
+    signer = SignGenerator()
+    generate_company_detail_info = signer.generate_company_detail(cid)
+    headers = {
+        'Host': 'cupid.51job.com',
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:140.0) Gecko/20100101 Firefox/140.0',
+        'Accept': 'application/json, text/plain, */*',
+        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
+        # NOTE(review): br/zstd are advertised; confirm the runtime has the
+        # matching decoders installed, otherwise res.json() may fail.
+        'Accept-Encoding': 'gzip, deflate, br, zstd',
+        'sign': generate_company_detail_info["signature"],
+        'uuid': '1e6151f7bc3ce8d7e526c88d7d6592cd',
+        'From-Domain': '51job_web',
+        'account-id': '',
+        'user-token': '',
+        'partner': '',
+        'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fjobs.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fjobs.51job.com%2Fall%2Fco4194496.html%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountId%22%3A%22%22%2C%22shortPageCode%22%3A%22gsxq%7Czwlb%7Cgsxqlb%22%2C%22pageCode%22%3A%22gsxq%7Czwlb%7Cgsxqlb%22%7D',
+        'Origin': 'https://jobs.51job.com',
+        'Connection': 'keep-alive',
+        'Referer': 'https://jobs.51job.com/',
+        'Sec-Fetch-Dest': 'empty',
+        'Sec-Fetch-Mode': 'cors',
+        'Sec-Fetch-Site': 'same-site',
+        'TE': 'trailers',
+    }
+
+    try:
+        # The timestamp in the URL must be the one that was signed.
+        desc_url = f"https://cupid.51job.com/open/noauth/company-info/pc-info?api_key=51job&timestamp={generate_company_detail_info['timestamp']}&encryCompanyId={cid}"
+        # A timeout keeps a stalled connection from hanging the caller; 30s
+        # matches search_company. No proxy is configured here.
+        # NOTE(security): verify=False disables TLS certificate checks (kept
+        # from the original behaviour).
+        res = requests.get(url=desc_url, headers=headers, verify=False, timeout=30)
+        # A Response is falsy for 4xx/5xx status codes.
+        if not res:
+            return {"company_desc": "请求失败", "company_location": "请求失败"}
+        company_dinfo = res.json()
+
+        coinfo = company_dinfo["resultbody"]["coinfo"]
+        return {"company_desc": coinfo["coinfo"], "company_location": coinfo["caddr"],"encryCompanyId":coinfo["encryCompanyId"]}
+
+    except Exception as e:
+        # Covers network errors, invalid JSON and missing keys alike.
+        print(f"解析HTML失败: {e}")
+        return {"company_desc": "解析失败", "company_location": "解析失败"}
+
+
+# Usage example: run this module directly for a manual smoke test (hits the live site).
+if __name__ == "__main__":
+    # Try a search
+    keyword = "华为技术有限公司"
+    result = search_company(keyword)
+    if result:
+        print(f"找到匹配的公司:")
+        print(f"  全称: {result['fullCompanyName']}")
+        print(f"  简称: {result['companyName']}")
+        print(f"  链接: {result['companyHref']}")
+    else:
+        print(f"未找到匹配的公司: {keyword}")
+    # Fetch company details for a sample company page URL.
+    print(parse_json_company_desc("https://jobs.51job.com/all/coA2RXNgBnATgPaQJn.html"))
--- a/company_spider/qcwy_company/test.py
+++ b/company_spider/qcwy_company/test.py
@ -0,0 +1,92 @@
+# -*- coding: UTF-8 -*-
+import time
+import uuid
+import requests
+import re
+import json
+import execjs
+
+
+# Scratch script: replays one captured 51job search request and walks through
+# the acw_sc__v2 cookie challenge. The `sign`, `timestamp` and `uuid` values
+# below are fixed captured values.
+headers = {
+    'Accept': 'application/json, text/plain, */*',
+    'Accept-Language': 'zh',
+    'Cache-Control': 'no-cache',
+    'Connection': 'keep-alive',
+    'From-Domain': '51job_web',
+    'Pragma': 'no-cache',
+    'Referer': 'https://we.51job.com/pc/search?keyword=java&searchType=2&sortType=0&metro=',
+    'Sec-Fetch-Dest': 'empty',
+    'Sec-Fetch-Mode': 'cors',
+    'Sec-Fetch-Site': 'same-origin',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
+    'account-id': '',
+    'partner': '',
+    'property': '%7B%22partner%22%3A%22%22%2C%22webId%22%3A2%2C%22fromdomain%22%3A%2251job_web%22%2C%22frompageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2F%22%2C%22pageUrl%22%3A%22https%3A%2F%2Fwe.51job.com%2Fpc%2Fsearch%3Fkeyword%3Djava%26searchType%3D2%26sortType%3D0%26metro%3D%22%2C%22identityType%22%3A%22%22%2C%22userType%22%3A%22%22%2C%22isLogin%22%3A%22%E5%90%A6%22%2C%22accountid%22%3A%22%22%7D',
+    'sec-ch-ua': '"Not.A/Brand";v="8", "Chromium";v="114", "Google Chrome";v="114"',
+    'sec-ch-ua-mobile': '?0',
+    'sec-ch-ua-platform': '"Windows"',
+    'sign': '839932c059141791d8a003f0e6652e14facbf788a502df374fecf9c107d93b9e',
+    'user-token': '',
+    'uuid': '1687228791235576552',
+}
+params = {
+    'api_key': '51job',
+    'timestamp': '1769139097',
+    'keyword': '华为技术有限公司',
+    'searchType': '2',
+    'function': '',
+    'industry': '',
+    'jobArea': '000000',
+    'jobArea2': '',
+    'landmark': '',
+    'metro': '',
+    'salary': '',
+    'workYear': '',
+    'degree': '',
+    'companyType': '',
+    'companySize': '',
+    'jobType': '',
+    'issueDate': '',
+    'sortType': '0',
+    'pageNum': '1',
+    'requestId': '',
+    'pageSize': '20',
+    'source': '1',
+    'accountId': '',
+    'pageCode': 'sou|sou|soulb',
+    'scene':'7'
+}
+
+# Replace with your own proxy, or go without one; a single IP is probably rate-limited.
+# Note: this dict is not passed to any request below.
+proxies = {
+    "http":"http://xxx",
+    "https":"http://xxxx"
+}
+
+
+# range(1, 2) yields a single iteration.
+for i in range(1,2):
+    try:
+        # cookie = {'guid': 'd02dfbabd84858301947663946e1710f'}
+        session = requests.session()
+        print("第%s次请求：" % i)
+        response = session.get('https://we.51job.com/api/job/search-pc', params=params,headers=headers,verify=False)  # key point: SSL verification disabled
+        print(response.text[:300])
+        # Raises IndexError (caught below) when the response has no arg1 challenge.
+        arg1 = re.findall("arg1='(.*?)';",response.text,re.S)[0]
+        print('arg1--->',arg1)
+        guid = str(uuid.uuid4()).replace("-", "")
+        cookie = {'guid': str(guid)}
+
+        # 04.js is opened relative to the current working directory.
+        with open('04.js', 'r', encoding='utf-8') as f:
+            js = f.read()
+        acw_sc__v2 = execjs.compile(js).call('l', arg1)
+        print('acw_sc__v2-->',acw_sc__v2)
+        cookie.update({"acw_sc__v2": acw_sc__v2})
+        # cookie.update({"acw_sc__v3": "649257ebe376df87b3db6a94c1e5ad37f42f783b"})
+        # Second request presents the computed cookie; cookies set by the response are merged in.
+        response2 = session.get('https://we.51job.com/api/job/search-pc', params=params,headers=headers,cookies=cookie,verify=False) #
+        cookie.update(response2.cookies.get_dict())
+
+        # Third request uses the merged cookie set.
+        response = session.get('https://we.51job.com/api/job/search-pc', params=params,headers=headers, cookies=cookie,verify=False)
+        print(response.text)
+        time.sleep(0.5)
+    except Exception as e:
+        print(e)
--- a/company_spider/zhilianzhaopin_company/init.py
+++ b/company_spider/zhilianzhaopin_company/init.py
@ -0,0 +1 @@
+# 智联招聘
--- a/company_spider/zhilianzhaopin_company/city.json
+++ b/company_spider/zhilianzhaopin_company/city.json
--- a/company_spider/zhilianzhaopin_company/searcc_kw.py
+++ b/company_spider/zhilianzhaopin_company/searcc_kw.py
@ -0,0 +1,68 @@
+import math
+import copy
+
+# 32-symbol alphabet; each symbol encodes one 5-bit group.
+R = "0123456789ABCDEFGHIJKLMNOPQRSTUV"
+
+
+def a(e: str) -> str:
+    """Map a binary-digit string with value 0-31 to its symbol in ``R``."""
+    t = int(e, 2)
+    return R[t]
+
+
+def n(e: str) -> str:
+    """Encode ``e`` as 5-bit symbols over its UTF-16 code units.
+
+    Each UTF-16 code unit contributes 16 bits (as JavaScript's ``charCodeAt``
+    would); the bit string is right-padded with zeros to a multiple of 5 and
+    every 5-bit group is mapped through ``a``.
+
+    Characters outside the Basic Multilingual Plane are split into surrogate
+    pairs. Formatting ``ord(ch)`` directly produced more than 16 bits for
+    them and shifted every following group.
+    """
+    # "surrogatepass" lets lone surrogates through as their own code unit.
+    units = e.encode("utf-16-be", "surrogatepass")
+    t = "".join(format(byte, "08b") for byte in units)
+
+    # Pad on the right to a multiple of 5 bits.
+    pad_len = 5 * math.ceil(len(t) / 5)
+    t = t.ljust(pad_len, "0")
+
+    return "".join(a(t[i:i + 5]) for i in range(0, len(t), 5))
+
+
+def generate_url(e: dict) -> str:
+    """Build the path/query fragment of a Zhilian search URL from ``e``.
+
+    The keys ``jl``, ``jt``, ``in``, ``kw`` and ``p`` (in that order) become
+    ``/``-joined path segments of the form ``<key><value>`` when truthy, with
+    the ``kw`` value passed through ``n`` first. Every other key with a truthy
+    value is appended as ``key=value`` in a ``?``/``&`` query string. The
+    input dict is not modified.
+    """
+    remaining = copy.deepcopy(e)
+    segments = []
+
+    for field in ("jl", "jt", "in", "kw", "p"):
+        field_value = remaining.get(field)
+        if not field_value:
+            continue
+        del remaining[field]
+        rendered = n(field_value) if field == "kw" else field_value
+        segments.append(f"{field}{rendered}")
+
+    # Whatever is left over (and truthy) goes into the query string.
+    query_pairs = [f"{name}={val}" for name, val in remaining.items() if val]
+
+    result = "/".join(segments)
+    if query_pairs:
+        result = result + "?" + "&".join(query_pairs)
+    return result
+
+
+# Manual check: print the search URL built for a sample location code and keyword.
+if __name__ == '__main__':
+    url = f"https://www.zhaopin.com/sou/{generate_url({'jl': 530, 'kw': 'app推广经理'})}"
+    print(url)
--- a/company_spider/zhilianzhaopin_company/spider.py
+++ b/company_spider/zhilianzhaopin_company/spider.py
@ -0,0 +1,284 @@
+from playwright.sync_api import sync_playwright, BrowserContext, Page
+import time
+import json
+import os
+from typing import List, Dict, Optional
+
+from company_spider.zhilianzhaopin_company.searcc_kw import generate_url
+
+
+class CityLoader:
+    """Singleton that maps city names to codes loaded from a JSON file.
+
+    The file is looked up next to this module. Its top-level ``allCity`` list
+    is walked recursively through each entry's ``sublist``; every entry's
+    ``name`` is mapped to its ``code``. Only the first construction loads the
+    file; later ``CityLoader(...)`` calls return the same, already-populated
+    object and ignore their ``city_file`` argument.
+    """
+
+    _instance = None
+
+    def __new__(cls, *args, **kwargs):
+        # Create the shared instance lazily and hand it out ever after.
+        if cls._instance is None:
+            cls._instance = super().__new__(cls)
+        return cls._instance
+
+    def __init__(self, city_file="city.json"):
+        # __init__ runs on every CityLoader() call; only the first one loads.
+        if not hasattr(self, 'city_map'):
+            module_dir = os.path.dirname(os.path.abspath(__file__))
+            self.file_path = os.path.join(module_dir, city_file)
+            self.city_map = {}
+            self._load_cities()
+
+    def _load_cities(self):
+        """Populate ``city_map`` from ``file_path``; report problems via print."""
+        if os.path.exists(self.file_path):
+            try:
+                with open(self.file_path, 'r', encoding='utf-8') as f:
+                    payload = json.load(f)
+                    self._parse_city_data(payload.get("allCity", []))
+            except Exception as e:
+                print(f"Error loading city file: {e}")
+        else:
+            print(f"City file not found: {self.file_path}")
+
+    def _parse_city_data(self, cities):
+        """Record name -> code for each entry, descending into non-empty sublists."""
+        for entry in cities:
+            self.city_map[entry['name']] = entry['code']
+            children = entry['sublist'] if 'sublist' in entry else None
+            if children:
+                self._parse_city_data(children)
+
+    def get_code(self, city_name):
+        """Return the code stored for ``city_name``, or None when unknown."""
+        return self.city_map.get(city_name)
+
+
+def get_companies_from_page(page: Page) -> List[Dict[str, str]]:
+    """Collect company names and absolute links from a search-result page.
+
+    A list of candidate CSS selectors is tried in order and the first one that
+    matches anything is used; if none match, every anchor whose href contains
+    ``company`` is taken instead. Elements without text or href are skipped
+    and only the first occurrence of each company name is kept.
+
+    Args:
+        page: The already-loaded search-result page.
+
+    Returns:
+        A list of ``{'name': ..., 'url': ...}`` dicts in page order.
+    """
+    companies = []
+
+    # Candidate selectors for the company-name anchor, most specific intent first.
+    company_selectors = [
+        'a[class*="company"]',
+        '.company-name a',
+        'a.company-name',
+        '[class*="CompanyName"] a',
+        'a[href*="/company/"]'
+    ]
+
+    company_elements = []
+    for selector in company_selectors:
+        try:
+            elements = page.query_selector_all(selector)
+        except Exception:
+            # Best effort: a selector that fails is simply skipped. Catching
+            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
+            continue
+        if elements:
+            company_elements = elements
+            print(f"使用选择器找到 {len(elements)} 个元素: {selector}")
+            break
+
+    # Nothing matched: fall back to any link that mentions "company".
+    if not company_elements:
+        company_elements = page.query_selector_all('a[href*="company"]')
+        print(f"通过通用方法找到 {len(company_elements)} 个公司链接")
+
+    seen_names = set()
+    for element in company_elements:
+        try:
+            company_name = element.inner_text().strip()
+            company_url = element.get_attribute('href')
+        except Exception:
+            # The element could not be read; skip it.
+            continue
+
+        if not company_name or not company_url:
+            continue
+
+        # Normalise the href to an absolute URL.
+        if company_url.startswith('//'):
+            # Protocol-relative link: it already names its own host.
+            company_url = f"https:{company_url}"
+        elif company_url.startswith('/'):
+            company_url = f"https://www.zhaopin.com{company_url}"
+        elif not company_url.startswith('http'):
+            company_url = f"https://www.zhaopin.com/{company_url}"
+
+        # De-duplicate by company name, keeping the first link seen.
+        if company_name in seen_names:
+            continue
+        seen_names.add(company_name)
+        companies.append({
+            'name': company_name,
+            'url': company_url
+        })
+
+    return companies
+
+
+def get_company_intro(context: BrowserContext, company_url: str) -> str:
+    """Open a company detail page in a new tab and return its introduction text.
+
+    Several candidate selectors are tried in order and the first non-empty text
+    wins. If none yields text, the first 500 characters of the page body are
+    used. The tab is always closed, including when navigation fails.
+
+    Args:
+        context: Browser context in which the temporary page is opened.
+        company_url: Absolute URL of the company detail page.
+
+    Returns:
+        The introduction text, ``"未找到公司简介"`` when nothing could be
+        extracted, or ``"获取失败: <error>"`` when opening or loading the page
+        raised. This function does not propagate ordinary exceptions.
+    """
+    # Candidate selectors for the introduction container, tried in order.
+    intro_selectors = [
+        '.company-intro',
+        '.company-description',
+        '[class*="intro"]',
+        '[class*="description"]',
+        '.company-info',
+        '[class*="CompanyIntro"]'
+    ]
+
+    try:
+        company_page = context.new_page()
+        try:
+            company_page.goto(company_url, wait_until="networkidle", timeout=30000)
+            time.sleep(2)
+
+            company_intro = ""
+            for selector in intro_selectors:
+                try:
+                    intro_element = company_page.query_selector(selector)
+                    if intro_element:
+                        company_intro = intro_element.inner_text().strip()
+                except Exception:
+                    # Best effort: a failing selector just moves on to the next.
+                    continue
+                if company_intro:
+                    break
+
+            # Still nothing: fall back to the beginning of the page text.
+            if not company_intro:
+                try:
+                    body = company_page.query_selector('body')
+                    if body:
+                        company_intro = body.inner_text()[:500]
+                except Exception:
+                    pass
+
+            return company_intro if company_intro else "未找到公司简介"
+        finally:
+            # Close the tab on every path so failed loads do not pile up pages.
+            company_page.close()
+    except Exception as e:
+        return f"获取失败: {str(e)}"
+
+
+def _find_target_company(companies: List[Dict[str, str]], target_company: str) -> Optional[Dict[str, str]]:
+    """Return the entry of ``companies`` that best matches ``target_company``.
+
+    An exact name match anywhere in the list wins over a substring match (in
+    either direction); within each tier the first hit in list order is used.
+    Returns None when nothing matches.
+    """
+    for company in companies:
+        company_name = company['name'].strip()
+        if company_name == target_company:
+            print(f"找到完全匹配的公司: {company_name}")
+            return company
+
+    for company in companies:
+        company_name = company['name'].strip()
+        if target_company in company_name or company_name in target_company:
+            print(f"找到部分匹配的公司: {company_name}")
+            return company
+
+    return None
+
+
+def crawl_companies(params: Dict, max_companies: int = 10, headless: bool = False, proxy: Optional[str] = None) -> List[Dict]:
+    """
+    Crawl company information from a Zhaopin search-result page.
+
+    Args:
+        params: Search parameters, e.g. ``{'jl': 530, 'kw': '...'}`` or
+            ``{'city': '北京', 'kw': '...'}``. ``city`` is a convenience key
+            that is translated to a ``jl`` code via ``CityLoader`` when ``jl``
+            is absent; it is never sent to the site. The dict is not modified.
+        max_companies: Maximum number of companies to fetch when no company
+            matches the keyword. Defaults to 10.
+        headless: Whether to run the browser headless. Defaults to False.
+        proxy: Proxy server address, e.g. ``"http://user:pass@host:port"``.
+
+    Returns:
+        A list of dicts with ``name``, ``url`` and ``intro``. When ``kw``
+        matches a company name (exactly, or failing that as a substring in
+        either direction) only that company is returned, so the list has
+        length 1. Otherwise up to ``max_companies`` companies are returned.
+    """
+    # Work on a copy so the caller's dict is left untouched.
+    search_params = dict(params)
+
+    # Resolve the helper-only 'city' key and drop it, so it does not end up in
+    # the generated URL as a stray "city=..." query parameter.
+    if 'city' in search_params:
+        city_name = search_params.pop('city')
+        if 'jl' not in search_params:
+            code = CityLoader().get_code(city_name)
+            if code:
+                print(f"城市 '{city_name}' 映射代码为: {code}")
+                search_params['jl'] = code
+            else:
+                print(f"未找到城市 '{city_name}' 的代码")
+
+    # Tolerate a missing or None keyword.
+    target_company = (search_params.get('kw') or '').strip()
+
+    with sync_playwright() as p:
+        browser_kwargs = {
+            "headless": headless,
+            "args": ["--disable-blink-features=AutomationControlled"]
+        }
+
+        # Use the locally installed Chrome when present, else Playwright's default.
+        chrome_path = r"C:\Program Files\Google\Chrome\Application\chrome.exe"
+        if os.path.exists(chrome_path):
+            browser_kwargs["executable_path"] = chrome_path
+
+        if proxy:
+            browser_kwargs["proxy"] = {"server": proxy}
+            print(f"使用代理: {proxy}")
+
+        browser = p.chromium.launch(**browser_kwargs)
+        try:
+            context = browser.new_context(
+                viewport={"width": 1920, "height": 1080},
+                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+            )
+            page = context.new_page()
+
+            # Build the search URL and load the result page.
+            url = f"https://www.zhaopin.com/sou/{generate_url(search_params)}"
+            print(f"访问URL: {url}")
+            page.goto(url, wait_until="networkidle", timeout=30000)
+            time.sleep(3)
+
+            companies = get_companies_from_page(page)
+            print(f"\n找到 {len(companies)} 家公司")
+
+            # With a keyword, return just the best-matching company if any.
+            if target_company:
+                print(f"搜索目标公司: {target_company}")
+                match = _find_target_company(companies, target_company)
+                if match is not None:
+                    print("正在获取公司简介...")
+                    return [{
+                        'name': match['name'].strip(),
+                        'url': match['url'],
+                        'intro': get_company_intro(context, match['url'])
+                    }]
+
+            # No match: fetch the first `max_companies` companies instead.
+            print(f"未找到完全匹配的公司，获取前 {max_companies} 家公司信息")
+            selected = companies[:max_companies]
+            results = []
+            for i, company in enumerate(selected, 1):
+                print(f"\n[{i}/{len(selected)}] 正在获取: {company['name']}")
+                results.append({
+                    'name': company['name'],
+                    'url': company['url'],
+                    'intro': get_company_intro(context, company['url'])
+                })
+                # Short pause between detail-page fetches.
+                time.sleep(1)
+
+            return results
+        finally:
+            # Runs on success and on error alike; closing the browser also
+            # closes the pages opened from it.
+            browser.close()
+
+
+if __name__ == '__main__':
+    # Manual smoke test: crawl one sample search and print what came back.
+    params = {'city': '北京', 'kw': 'app推广经理'}
+    results = crawl_companies(params, max_companies=10)
+
+    divider = "=" * 80
+    print("\n" + divider)
+    print("爬取结果:")
+    print(divider)
+    for result in results:
+        print(f"\n公司名称: {result['name']}")
+        print(f"公司链接: {result['url']}")
+        # Show at most the first 200 characters of a long introduction.
+        intro = result['intro']
+        if len(intro) > 200:
+            print(f"公司简介: {intro[:200]}...")
+        else:
+            print(f"公司简介: {intro}")
+        print("-" * 80)
--- a/jobs_spider/qcwy/crawl_stats.log
+++ b/jobs_spider/qcwy/crawl_stats.log
@ -0,0 +1,6 @@
+{"timestamp": "2026-01-15 00:38:02", "total_crawled": 517, "unique_count": 503, "duplicate_count": 14, "api_total_count": 505, "job_area": "商丘", "function_type": "8305"}
+{"timestamp": "2026-01-15 01:36:23", "total_crawled": 517, "unique_count": 509, "duplicate_count": 8, "api_total_count": 517, "job_area": "广安", "function_type": "1318"}
+{"timestamp": "2026-01-15 02:32:36", "total_crawled": 517, "unique_count": 511, "duplicate_count": 6, "api_total_count": 517, "job_area": "阜阳", "function_type": "6101"}
+{"timestamp": "2026-01-15 03:32:52", "total_crawled": 517, "unique_count": 513, "duplicate_count": 4, "api_total_count": 517, "job_area": "常德", "function_type": "3812"}
+{"timestamp": "2026-01-15 04:31:42", "total_crawled": 517, "unique_count": 510, "duplicate_count": 7, "api_total_count": 517, "job_area": "惠州", "function_type": "3335"}
+{"timestamp": "2026-01-15 05:28:54", "total_crawled": 517, "unique_count": 515, "duplicate_count": 2, "api_total_count": 517, "job_area": "锦州", "function_type": "0154"}
--- a/jobs_spider/qcwy/logs/log_2026-01-15.log
+++ b/jobs_spider/qcwy/logs/log_2026-01-15.log
--- a/jobs_spider/qcwy/qcwy.py
+++ b/jobs_spider/qcwy/qcwy.py
@ -25,7 +25,7 @@ os.makedirs("logs", exist_ok=True)
 logger.add("logs/log_{time:YYYY-MM-DD}.log", level="INFO", rotation="00:00", retention="30 days", enqueue=True)


-API_BASE_URL = os.getenv('API_BASE_URL', 'http://127.0.0.1:9999')
+API_BASE_URL = os.getenv('API_BASE_URL', 'http://124.222.106.226:9999')

 API_PUBLIC_HOST = os.getenv('API_PUBLIC_HOST')
 PROXY_URL = "http://t13319619426654:ln8aj9nl@s432.kdltps.com:15818"