Plan 01 - DATA-01: 30-day window dedup fix: - dedup.py: both single-field and double-field SQL queries now include AND created_at > now() - INTERVAL 30 DAY - tests/ingest/test_dedup.py: 6 mock tests validating 30-day window Plan 02 - DATA-04: company vs search job channel separation: - schemas/ingest.py: ChannelType.COMPANY = 'company' - configs/boss.py: register channel='company' config - configs/qcwy.py: register channel='company' config - configs/zhilian.py: register channel='company' config - company_jobs_sync.py: store_batch(..., 'mini', ...) → (..., 'company', ...) DATA-02: confirmed already complete (job.py has /data/batch-async endpoint) DATA-03: confirmed already complete (company_cleaner.py full pipeline) Full regression: 112 passed (106 existing + 6 new)
104 lines
3.9 KiB
Python
104 lines
3.9 KiB
Python
from typing import Dict, Any, Optional
|
|
|
|
from app.services.ingest.registry import PlatformConfig, DedupFieldSpec, register
|
|
from app.services.ingest.remote_push import safe_join
|
|
|
|
|
|
def _extract_job_id(data: Dict[str, Any]) -> Optional[str]:
|
|
val = data.get("jobId")
|
|
return str(val) if val else None
|
|
|
|
|
|
def _extract_update_dt(data: Dict[str, Any]) -> Optional[str]:
|
|
val = data.get("updateDateTime")
|
|
return str(val) if val else None
|
|
|
|
|
|
def _extract_company_name(data: Dict[str, Any]) -> Optional[str]:
|
|
name = data.get("companyName") or data.get("company_name")
|
|
return str(name) if name else None
|
|
|
|
|
|
def _qcwy_text(value: Any) -> str:
    """Render *value* as text, mapping ``None`` to ``""`` instead of ``"None"``."""
    return "" if value is None else str(value)


def _qcwy_dict(value: Any) -> Dict[str, Any]:
    """Return *value* when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _qcwy_welfare(welfare: Any) -> str:
    """Flatten the ``jobWelfareCodeDataList`` field into a comma-separated string."""
    if isinstance(welfare, list):
        labels = []
        for item in welfare:
            if not isinstance(item, dict):
                continue
            label = (
                item.get("chineseTitle")
                or item.get("typeTitle")
                or item.get("englishTitle")
                or item.get("code")
            )
            # An entry without any usable label used to be emitted as the
            # literal text "None"; drop it instead.
            if label is None or label == "":
                continue
            labels.append(str(label))
        return ",".join(labels)
    if isinstance(welfare, str):
        # String input only has its square brackets stripped.
        return welfare.replace("[", "").replace("]", "")
    return ""


def _build_qcwy_push(data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Map a raw qcwy job record onto the flat payload used for pushing.

    Args:
        data: Raw job record. It is embedded unchanged under ``base_data``.

    Returns:
        A dict with the push fields. ``location`` and ``position`` fall back
        to nested ``workLocation`` / ``jobAreaLevelDetail`` values and finally
        to a fixed placeholder when nothing is available. ``salary`` and
        ``industry`` are dash-joined pairs in which a missing or null part is
        rendered as an empty string.
    """
    welfare_str = _qcwy_welfare(data.get("jobWelfareCodeDataList"))

    raw_location = data.get("location") or ""
    if not raw_location:
        # Tolerate a non-dict workLocation rather than failing on .get().
        work_loc = _qcwy_dict(data.get("workLocation"))
        raw_location = work_loc.get("workAddress") or work_loc.get("address") or ""
    location_val = raw_location or "位置信息未找到"

    raw_area = data.get("jobAreaString") or ""
    if not raw_area:
        level_detail = _qcwy_dict(data.get("jobAreaLevelDetail"))
        city_str = level_detail.get("cityString") or ""
        landmark_str = level_detail.get("landMarkString") or ""
        raw_area = f"{city_str}{landmark_str}".strip()
    area_val = raw_area or "位置信息未找到"

    # NOTE(review): salary is rendered as "<max>-<min>", exactly as before;
    # confirm that downstream consumers really expect this order.
    salary = f'{_qcwy_text(data.get("jobSalaryMax"))}-{_qcwy_text(data.get("jobSalaryMin"))}'
    industry = f'{_qcwy_text(data.get("major1Str"))}-{_qcwy_text(data.get("major2Str"))}'

    return {
        "source_type": "前程无忧",
        "name": data.get("companyName"),
        "title": data.get("jobName"),
        "title_addr": data.get("jobName"),
        "description": data.get("jobDescribe"),
        "age": "", "sex": "", "number": "",
        "education": data.get("degreeString"),
        "skill": safe_join(data.get("jobTagsForOrder")),
        "welfare": welfare_str,
        "years": data.get("workYearString"),
        "salary": salary,
        "location": location_val,
        "position": area_val,
        "date": data.get("confirmDateString"),
        "start_date": data.get("confirmDateString"),
        "end_date": "",
        "job_type": data.get("termStr"),
        "size": data.get("companySizeString"),
        "employer_type": data.get("companyTypeString"),
        "industry": industry,
        "job_1st_class": "", "job_2nd_class": "", "job_3rd_class": "", "job_4th_class": "",
        "url": data.get("jobHref"),
        "company_id": data.get("coId"),
        "company_name": data.get("fullCompanyName"),
        "company_url": data.get("companyHref"),
        "company_desc": data.get("company_desc", ""),
        "base_data": data,
    }
|
|
|
|
|
|
# qcwy / "mini" channel / job records: table "qcwy_job", deduplicated on the
# (job_id, update_date_time) pair and mapped for push by _build_qcwy_push.
register(
    PlatformConfig(
        platform="qcwy",
        channel="mini",
        data_type="job",
        table="qcwy_job",
        dedup_fields=(
            DedupFieldSpec(column="job_id", extractor=_extract_job_id),
            DedupFieldSpec(column="update_date_time", extractor=_extract_update_dt),
        ),
        push_mapper=_build_qcwy_push,
    )
)
|
|
|
|
# qcwy / "mini" channel / company records: table "qcwy_company", deduplicated
# on company_name only; no push mapper is configured here.
register(
    PlatformConfig(
        platform="qcwy",
        channel="mini",
        data_type="company",
        table="qcwy_company",
        dedup_fields=(
            DedupFieldSpec(column="company_name", extractor=_extract_company_name),
        ),
    )
)
|
|
|
|
# Company-associated jobs (written via company_jobs_sync), kept distinct from
# the "mini" channel used for search-result jobs. Same table and dedup columns
# as the "mini" job config; no push mapper is configured here.
register(
    PlatformConfig(
        platform="qcwy",
        channel="company",
        data_type="job",
        table="qcwy_job",
        dedup_fields=(
            DedupFieldSpec(column="job_id", extractor=_extract_job_id),
            DedupFieldSpec(column="update_date_time", extractor=_extract_update_dt),
        ),
    )
)
|