from clickhouse_connect.driver import AsyncClient

from app.log import logger


class ClickHouseInitializer:
    """Create the ClickHouse tables and the analytics view in ``job_data``.

    Every statement is idempotent (``IF NOT EXISTS`` / ``OR REPLACE``), so
    ``initialize_all_tables`` can be run repeatedly against the same server.
    """

    # DDL for the six data tables, keyed by table name. Each definition
    # already includes the ``channel`` column.
    _TABLE_DDLS = {
        "boss_job": """
        CREATE TABLE IF NOT EXISTS job_data.boss_job (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '',
            job_id String DEFAULT '',
            channel String DEFAULT 'mini',
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """,
        "boss_company": """
        CREATE TABLE IF NOT EXISTS job_data.boss_company (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '',
            company_name String DEFAULT '',
            channel String DEFAULT 'mini',
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """,
        "qcwy_job": """
        CREATE TABLE IF NOT EXISTS job_data.qcwy_job (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '',
            job_id String DEFAULT '',
            update_date_time String DEFAULT '',
            channel String DEFAULT 'mini',
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """,
        "qcwy_company": """
        CREATE TABLE IF NOT EXISTS job_data.qcwy_company (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '',
            company_name String DEFAULT '',
            channel String DEFAULT 'mini',
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """,
        "zhilian_job": """
        CREATE TABLE IF NOT EXISTS job_data.zhilian_job (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '',
            number String DEFAULT '',
            first_publish_time String DEFAULT '',
            channel String DEFAULT 'mini',
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """,
        "zhilian_company": """
        CREATE TABLE IF NOT EXISTS job_data.zhilian_company (
            id UInt64 DEFAULT 0,
            json_data String DEFAULT '',
            company_name String DEFAULT '',
            channel String DEFAULT 'mini',
            created_at DateTime DEFAULT now(),
            updated_at DateTime DEFAULT now()
        ) ENGINE = MergeTree()
        ORDER BY created_at
        SETTINGS index_granularity = 8192;
        """,
    }

    # Pending-company table: a ReplacingMergeTree keyed on
    # (source, company_id) with ``version`` as the version column.
    _PENDING_COMPANY_DDL = """
    CREATE TABLE IF NOT EXISTS job_data.pending_company (
        source String,
        company_id String,
        company_name String DEFAULT '',
        status String DEFAULT 'pending',
        error_msg String DEFAULT '',
        created_at DateTime DEFAULT now(),
        updated_at DateTime DEFAULT now(),
        version UInt64 DEFAULT 1
    ) ENGINE = ReplacingMergeTree(version)
    ORDER BY (source, company_id)
    SETTINGS index_granularity = 8192;
    """

    # View that unions the three job tables into one column layout.
    # salary_min / salary_max are constant 0.0 in every branch.
    _JOB_ANALYTICS_VIEW = """
    CREATE OR REPLACE VIEW job_data.job_analytics AS
    SELECT
        'boss' as source,
        job_id,
        channel,
        JSONExtractString(json_data, 'jobName') as position_name,
        JSONExtractString(json_data, 'brandName') as company_name,
        JSONExtractString(json_data, 'salaryDesc') as salary_text,
        0.0 as salary_min,
        0.0 as salary_max,
        JSONExtractString(json_data, 'cityName') as city,
        JSONExtractString(json_data, 'experienceName') as experience_required,
        JSONExtractString(json_data, 'degreeName') as education,
        created_at
    FROM job_data.boss_job
    UNION ALL
    SELECT
        'qcwy' as source,
        job_id,
        channel,
        JSONExtractString(json_data, 'jobName') as position_name,
        JSONExtractString(json_data, 'companyName') as company_name,
        JSONExtractString(json_data, 'provideSalaryString') as salary_text,
        0.0,
        0.0,
        JSONExtractString(json_data, 'workCity') as city,
        JSONExtractString(json_data, 'workYear') as experience_required,
        JSONExtractString(json_data, 'degree') as education,
        created_at
    FROM job_data.qcwy_job
    UNION ALL
    SELECT
        'zhilian' as source,
        number as job_id,
        channel,
        JSONExtractString(json_data, 'jobName') as position_name,
        JSONExtractString(json_data, 'companyName') as company_name,
        JSONExtractString(json_data, 'salary60') as salary_text,
        0.0,
        0.0,
        JSONExtractString(json_data, 'workCity') as city,
        JSONExtractString(json_data, 'workingExp') as experience_required,
        JSONExtractString(json_data, 'education') as education,
        created_at
    FROM job_data.zhilian_job
    """

    # Tables that receive the ``channel`` column via ALTER TABLE.
    _CHANNEL_MIGRATION_TABLES = [
        "boss_job",
        "boss_company",
        "qcwy_job",
        "qcwy_company",
        "zhilian_job",
        "zhilian_company",
    ]

    def __init__(self, client: AsyncClient):
        """Keep the client that every statement is sent through."""
        self.client = client

    async def _create_table(self, name: str, ddl: str) -> None:
        """Execute ``ddl`` and log the result under the table name ``name``.

        Any exception from the client is logged and re-raised unchanged.
        """
        try:
            await self.client.command(ddl)
            logger.info(f"表 {name} 创建成功")
        except Exception as exc:
            logger.error(f"创建表 {name} 失败: {exc}")
            raise

    async def _create_analytics_view(self) -> None:
        """Create or replace the ``job_analytics`` view.

        Any exception from the client is logged and re-raised unchanged.
        """
        try:
            await self.client.command(self._JOB_ANALYTICS_VIEW)
            logger.info("招聘数据分析视图 job_analytics 创建成功")
        except Exception as exc:
            logger.error(f"创建招聘数据分析视图失败: {exc}")
            raise

    async def initialize_channel_migration(self) -> None:
        """Add the ``channel`` column to each table in the migration list.

        Runs ``ALTER TABLE ... ADD COLUMN IF NOT EXISTS`` per table. This is
        best-effort: a failure on one table is logged as a warning and the
        loop moves on to the next table instead of raising.
        """
        for table in self._CHANNEL_MIGRATION_TABLES:
            statement = (
                f"ALTER TABLE job_data.{table} "
                f"ADD COLUMN IF NOT EXISTS channel String DEFAULT 'mini'"
            )
            try:
                await self.client.command(statement)
                logger.info(f"表 {table} channel 列迁移完成")
            except Exception as exc:
                logger.warning(f"表 {table} channel 列迁移跳过: {exc}")

    async def initialize_all_tables(self) -> None:
        """Create all tables, run the channel migration, then build the view.

        Order matters: the view selects ``channel`` from the job tables, so
        it is created only after the tables exist and the migration has run.
        Any failure in table or view creation is logged and re-raised.
        """
        logger.info("开始初始化 ClickHouse 数据库表...")
        try:
            # The six data tables first, then the pending-company table.
            creation_plan = (
                *self._TABLE_DDLS.items(),
                ("pending_company", self._PENDING_COMPANY_DDL),
            )
            for name, ddl in creation_plan:
                await self._create_table(name, ddl)

            # Bring tables that already existed up to date with ``channel``.
            await self.initialize_channel_migration()

            # Create or rebuild the unified analytics view.
            await self._create_analytics_view()

            logger.info("ClickHouse 数据库表初始化完成")
        except Exception as exc:
            logger.error(f"ClickHouse 数据库初始化失败: {exc}")
            raise