win 3d202c3486 feat(05): data pipeline optimization (DATA-01, DATA-04)
Plan 01 - DATA-01: 30-day window dedup fix:
- dedup.py: both single-field and double-field SQL queries now include
  AND created_at > now() - INTERVAL 30 DAY
- tests/ingest/test_dedup.py: 6 mock tests validating 30-day window

Plan 02 - DATA-04: company vs search job channel separation:
- schemas/ingest.py: ChannelType.COMPANY = 'company'
- configs/boss.py: register channel='company' config
- configs/qcwy.py: register channel='company' config
- configs/zhilian.py: register channel='company' config
- company_jobs_sync.py: store_batch(..., 'mini', ...) → (..., 'company', ...)

DATA-02: confirmed already complete (job.py has /data/batch-async endpoint)
DATA-03: confirmed already complete (company_cleaner.py full pipeline)

Full regression: 112 passed (106 existing + 6 new)
2026-03-21 19:50:06 +08:00

82 lines
3.1 KiB
Python

from typing import Dict, Any, Optional
from app.services.ingest.registry import PlatformConfig, DedupFieldSpec, register
from app.services.ingest.remote_push import safe_get, safe_join
def _extract_number(data: Dict[str, Any]) -> Optional[str]:
val = data.get("number")
return str(val) if val else None
def _extract_fpt(data: Dict[str, Any]) -> Optional[str]:
val = data.get("firstPublishTime")
return str(val) if val else None
def _extract_company_name(data: Dict[str, Any]) -> Optional[str]:
name = data.get("companyName") or data.get("name")
return str(name) if name else None
def _build_zhilian_push(data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Map a raw Zhilian job record onto the remote-push payload.

    Args:
        data: Raw job record; fields are read by their original camelCase
            keys (``companyName``, ``jobSummary``, ``skillLabel``, ...).

    Returns:
        The payload dict. Fields that are not read from ``data`` are sent as
        empty strings, and the unmodified record is attached under
        ``base_data``.
    """
    # Only well-formed {"value": ...} tags contribute a skill. A missing,
    # null or non-list ``skillLabel`` yields no skills instead of raising.
    skill_labels = data.get("skillLabel")
    if not isinstance(skill_labels, (list, tuple)):
        skill_labels = []
    skill_values = [
        tag["value"]
        for tag in skill_labels
        if isinstance(tag, dict) and "value" in tag
    ]

    # Values reused under several payload keys are looked up once.
    company_name = safe_get(data, "companyName")
    job_title = safe_get(data, "name")
    city_and_district = f"{safe_get(data, 'workCity')}{safe_get(data, 'cityDistrict')}"

    return {
        "source_type": "智联招聘",
        "name": company_name,
        "common_name": company_name,
        "title": job_title,
        "title_addr": job_title,
        "description": safe_get(data, "jobSummary"),
        "education": safe_get(data, "education"),
        "skill": safe_join(skill_values),
        "welfare": "",
        "years": safe_get(data, "workingExp"),
        "salary": safe_get(data, "salary60"),
        "location": city_and_district,
        "position": city_and_district,
        "job_type": safe_get(data, "workType"),
        "size": safe_get(data, "companySize"),
        "employer_type": safe_get(data, "propertyName"),
        "industry": safe_get(data, "industryName"),
        "job_1st_class": "",
        "job_2nd_class": "",
        "job_3rd_class": "",
        "job_4th_class": "",
        "date": safe_get(data, "firstPublishTime"),
        "start_date": "",
        "end_date": "",
        "age": "",
        "sex": "",
        # NOTE(review): str() would turn a None from safe_get into "None";
        # presumably safe_get returns "" for missing keys -- confirm.
        "number": str(safe_get(data, "recruitNumber")),
        "url": safe_get(data, "positionURL"),
        "company_id": str(safe_get(data, "companyId")),
        "company_name": company_name,
        "company_url": safe_get(data, "companyUrl"),
        "company_desc": safe_get(data, "companyDesc"),
        "base_data": data,
    }
# Zhilian job records on the "mini" channel: stored in ``zhilian_job``,
# with ``number`` and ``first_publish_time`` as dedup columns and
# ``_build_zhilian_push`` as the push mapper.
register(
    PlatformConfig(
        platform="zhilian",
        channel="mini",
        data_type="job",
        table="zhilian_job",
        dedup_fields=(
            DedupFieldSpec(column="number", extractor=_extract_number),
            DedupFieldSpec(column="first_publish_time", extractor=_extract_fpt),
        ),
        push_mapper=_build_zhilian_push,
    )
)
# Zhilian company records on the "mini" channel: stored in
# ``zhilian_company`` with ``company_name`` as the only dedup column.
register(
    PlatformConfig(
        platform="zhilian",
        channel="mini",
        data_type="company",
        table="zhilian_company",
        dedup_fields=(
            DedupFieldSpec(column="company_name", extractor=_extract_company_name),
        ),
    )
)
# Company-associated jobs (written via company_jobs_sync; kept on their own
# "company" channel to distinguish them from "mini" search jobs). They share
# the ``zhilian_job`` table and dedup columns; no push mapper is configured.
register(
    PlatformConfig(
        platform="zhilian",
        channel="company",
        data_type="job",
        table="zhilian_job",
        dedup_fields=(
            DedupFieldSpec(column="number", extractor=_extract_number),
            DedupFieldSpec(column="first_publish_time", extractor=_extract_fpt),
        ),
    )
)