提交 8dc73b9f authored 作者: 宋宏伟's avatar 宋宏伟

update

上级 0a6d0920
import time
from datetime import datetime, timedelta
import logging
from ETL_diagnosis import *
from ETL_drug import *
from ETL_lab import *
from ETL_patient import *
from ETL_visit import *
class BeijingTimeFormatter(logging.Formatter):
    """Logging formatter that renders record timestamps in Beijing time (UTC+8).

    The conversion is done from the record's epoch timestamp, so the output
    does not depend on the time zone configured on the host.
    """

    def formatTime(self, record, datefmt=None):
        """Format ``record.created`` as Beijing wall-clock time.

        Args:
            record: The ``logging.LogRecord`` being formatted.
            datefmt: Optional ``strftime`` pattern. When omitted, an ISO-8601
                string with millisecond precision is returned.

        Returns:
            The formatted timestamp string (no UTC offset suffix).
        """
        # Function-scope import: the module header only imports datetime/timedelta.
        from datetime import timezone

        # Convert the epoch value directly to UTC+8. Adding eight hours to
        # datetime.fromtimestamp() (host-local time) is only right on a UTC host.
        beijing = timezone(timedelta(hours=8))
        ct = datetime.fromtimestamp(record.created, tz=beijing).replace(tzinfo=None)
        if datefmt:
            return ct.strftime(datefmt)
        return ct.isoformat(timespec='milliseconds')

    def format(self, record):
        """Format the record, always populating ``record.asctime`` first."""
        # Kept so that asctime is set even when the format string omits it.
        record.asctime = self.formatTime(record, self.datefmt)
        return super().format(record)
# Shared formatter that stamps every log line with Beijing time.
formatter = BeijingTimeFormatter('%(asctime)s - %(levelname)s - %(message)s')

# One handler for the run log file, one for the console; both use the formatter.
file_handler = logging.FileHandler('etl_run.log')
stream_handler = logging.StreamHandler()
for _handler in (file_handler, stream_handler):
    _handler.setFormatter(formatter)

# Install both handlers on the root logger at INFO level.
logging.basicConfig(
    handlers=[file_handler, stream_handler],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def parse_date(date_string):
    """Parse a ``YYYY-MM-DD`` string into a ``datetime``.

    A malformed value is reported on stdout and the ``ValueError`` is
    re-raised so the caller stops processing.
    """
    try:
        parsed = datetime.strptime(date_string, "%Y-%m-%d")
    except ValueError as exc:
        print(f"日期格式错误: {date_string}. 错误信息: {exc}")
        raise  # propagate so processing stops
    else:
        return parsed
def run_etl(pv_id, table_name, data_start_time, data_end_time):
    """Run the ETL step for one provider, one target table and one date range.

    Args:
        pv_id: Provider identifier passed through to the table-specific ETL.
        table_name: One of 'patient', 'visit', 'prescribing', 'diagnosis', 'lab'.
        data_start_time: Range start as a ``YYYY-MM-DD`` string.
        data_end_time: Range end as a ``YYYY-MM-DD`` string.

    Raises:
        ValueError: If either date string is not in ``YYYY-MM-DD`` format.

    Failures inside the table-specific ETL are logged and swallowed so that
    the caller's loop can continue with the next task.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"

    start = parse_date(data_start_time)
    end = parse_date(data_end_time)
    # Widened window: 30 days before the start and 60 days after the end.
    wide_start = start - timedelta(days=30)
    wide_end = end + timedelta(days=60)

    # Timestamp literals with an explicit +00:00 offset suffix.
    start_utc, end_utc, wide_start_utc, wide_end_utc = (
        moment.strftime(timestamp_format) + "+00:00"
        for moment in (start, end, wide_start, wide_end)
    )

    try:
        log_message = f"开始执行 ETL 任务,数据时间范围: {data_start_time} 至 {data_end_time}..."
        print(log_message)
        logging.info(log_message)

        # Dispatch table replaces the if/elif chain; all steps share one signature.
        etl_steps = {
            'patient': etl_patient,
            'visit': etl_visit,
            'prescribing': etl_prescribing,
            'diagnosis': etl_diagnosis,
            'lab': etl_lab_result_cm,
        }
        step = etl_steps.get(table_name)
        if step is None:
            # Previously an unknown name fell through without any trace.
            logging.warning(f"未知的表名: {table_name},跳过。")
            return
        step(pv_id, data_start_time, data_end_time, start_utc, end_utc,
             wide_start_utc, wide_end_utc)
    except Exception as e:
        # Best-effort by design: report the failure and let the caller continue.
        print(f"ETL 任务执行失败: {e}")
        logging.exception(f"ETL 任务执行失败: {e}")
# Extraction windows as [start, end] date strings; consecutive windows share a boundary date.
date_ranges = [
    ["2021-01-01", "2021-07-01"],
    ["2021-07-01", "2022-01-01"],
    ["2022-01-01", "2022-07-01"],
    ["2022-07-01", "2023-01-01"],
    ["2023-01-01", "2023-07-01"],
    ["2023-07-01", "2024-01-01"],
    ["2024-01-01", "2024-07-01"],
    ["2024-07-01", "2024-10-01"],
]
# Provider identifiers and target tables to process.
pv_ids = ['320106426090445', '320104466002630', '320106466000838']
tables = ['patient', 'visit', 'prescribing', 'diagnosis', 'lab']

etl_start_time = time.time()
# Same nesting order as before: provider, then table, then date window.
for pv_id, table_name, (start_date, end_date) in (
    (provider, table, window)
    for provider in pv_ids
    for table in tables
    for window in date_ranges
):
    run_etl(pv_id, table_name, start_date, end_date)
etl_end_time = time.time()

# Report the total wall-clock duration on stdout and in the log.
total_time = etl_end_time - etl_start_time
print(f"所有 ETL 任务共耗时: {total_time:.2f} 秒")
logging.info(f"所有 ETL 任务共耗时: {total_time:.2f} 秒")
from data_query import *
def etl_diagnosis(pv_id,d_start_time,d_end_time,d_start_time_utc,d_end_time_utc,d_start_time2_utc,d_end_time2_utc):
    """Extract diagnosis rows for one provider's diabetes cohort into ./diagnosis.csv.

    Two SQL statements are built (outpatient/emergency visits first, inpatient
    stays second) and passed to ``execute_queries_and_write_to_csv``.

    Args:
        pv_id: Value matched against ``visit_record.organization_id``.
        d_start_time, d_end_time: ``YYYY-MM-DD`` strings bounding the
            outpatient ``date_of_visiting`` (both ends inclusive).
        d_start_time_utc, d_end_time_utc: Timestamp literals bounding the
            inpatient ``discharge_time`` (start inclusive, end exclusive).
        d_start_time2_utc, d_end_time2_utc: Timestamp literals bounding
            ``diagnosis_time`` (start inclusive, end exclusive).

    NOTE(review): the outpatient window is closed on both ends while the
    inpatient window is half-open; confirm that asymmetry is intended.
    """
    output_path = r'./diagnosis.csv'

    # Plain strings on purpose: inside an f-string the regex quantifiers
    # {0,4} / {0,3} are evaluated as Python tuples and rendered as "(0, 4)" /
    # "(0, 3)", which silently broke three of these patterns.
    dx_patterns = (
        '白内障|视网膜|眼|黄斑|玻璃体|玻血|网脱|失明|弱视|视力',
        '黄斑水肿|失明|眼球?萎缩|眼球?缺失|盲目(3|三)|(视力|视觉)重度|神经',
        '截肢|切断|截断|截|蛋白尿|蛋白尿|肾?移植|透析?|尿毒症|CKD(5|Ⅴ|五)|肾.{0,4}终末|终末.{0,4}肾',
        '(颈|髂总|髂内|肾脏|肢端|腹主|肢|肾小)?主?动脉(粥样硬|痉挛|坏疽|硬|瘤|炎|栓塞|血栓)?化?|间歇性?跛行|红斑性肢痛|(伯|柏)格|雷诺氏|周围血管疾?病|动脉(肌纤维发育异|坏疽|痉挛)?|主动脉(瘤|炎)?|主动脉(粥样)?硬化|(静脉)?曲张|血栓性静脉|下肢(深静脉血栓|静脉曲张|动脉闭塞|血栓性静脉炎|静脉功能不全|静脉炎|(血管|动脉)闭塞症|静脉肌间血栓形成)?|周围循环',
        '冠(心病|状|脉)|旁路移植|搭桥|多支|PCI|心绞痛|动脉硬化.{0,3}心脏病|心(肌|脏)(缺|供)血|缺血性心(脏|肌)病|心肌?梗',
        '心梗|心肌梗死|心痛|陈旧(性|型)?(心|ST|非ST|Q|前|侧|下|高|间|广泛|(左|右)心室)|心肌梗塞|胸痹',
        '脑.{0,3}(梗|塞|死)|卒中|中风',
        '心(力|室|房)?衰(竭)?|心功能不全|心功能.*级|心源性哮喘|低心排综合征|KILLIP.*级|HBP|高血压',
        '血脂异常|(胆固醇|高脂|甘油三?(脂|酯))血症|高血脂|高粘血症|高(密度)?(酯|脂)蛋白|低(密度)?(酯|脂)蛋白|高?三酰甘油',
        '糖尿病',
    )
    # Substituting a value into an f-string does not re-interpret its braces.
    dx_filter = "\n            OR ".join(
        f"diagnosis_name ~* '{pattern}'" for pattern in dx_patterns
    )

    # t1: every visit of the provider; t2: visits carrying a diabetes diagnosis.
    provider_and_diabetes_ctes = f"""
    t1 AS (SELECT DISTINCT
               pat_base_id AS patient_id,
               visit_record_id AS visit_id,
               organization_id AS provider_id
           FROM iceberg.cdm.visit_record
           WHERE organization_id = '{pv_id}'
    ),
    t2 AS (SELECT DISTINCT
               visit_record_id AS visit_id
           FROM iceberg.cdm.patient_diagnosis
           WHERE diagnosis_name LIKE '%糖尿病%'
             AND diagnosis_time >= TIMESTAMP '{d_start_time2_utc}'
             AND diagnosis_time < TIMESTAMP '{d_end_time2_utc}'
    )"""

    # t3 variants: the visits of interest inside the extraction window.
    outpatient_visits = f"""
    t3 AS (SELECT DISTINCT
               visit_record_id AS visit_id,
               date_of_visiting AS admission_datetime,
               NULL AS discharge_datetime,
               specialty_name AS specialty,
               CASE
                   WHEN is_emergency = TRUE THEN '急诊'
                   ELSE '门诊'
               END AS patient_type
           FROM iceberg.cdm.outpatient_record
           WHERE CAST(date_of_visiting AS DATE) >= DATE '{d_start_time}'
             AND CAST(date_of_visiting AS DATE) <= DATE '{d_end_time}')"""
    inpatient_visits = f"""
    t3 AS (SELECT DISTINCT
               visit_record_id AS visit_id,
               admission_time AS admission_datetime,
               discharge_time AS discharge_datetime,
               admission_specialty_name AS specialty,
               '住院' AS patient_type
           FROM iceberg.cdm.inpat_record
           WHERE discharge_time >= TIMESTAMP '{d_start_time_utc}'
             AND discharge_time < TIMESTAMP '{d_end_time_utc}'
    )"""

    # t4: patients with a diabetes visit; t5: all provider visits of those patients.
    cohort_patient_ctes = """
    t4 AS (SELECT DISTINCT a.patient_id
           FROM t1 a
           JOIN t2 b ON a.visit_id = b.visit_id),
    t5 AS (SELECT b.patient_id, b.visit_id, b.provider_id
           FROM t4 a
           JOIN t1 b ON a.patient_id = b.patient_id)"""

    def cohort_visits_cte(columns):
        """Return the t6 CTE (cohort visits joined to t3) with the given column list."""
        return f"""
    t6 AS (SELECT DISTINCT {columns}
           FROM t5 a
           JOIN t3 b ON a.visit_id = b.visit_id)"""

    outpatient_cohort_visits = cohort_visits_cte(
        "a.patient_id, a.visit_id, b.specialty, b.patient_type, a.provider_id")
    inpatient_cohort_visits = cohort_visits_cte(
        "a.patient_id, a.visit_id, b.admission_datetime, b.discharge_datetime, "
        "b.specialty, b.patient_type, a.provider_id")

    # t7: diagnoses of interest inside the widened window.
    diagnosis_rows = f"""
    t7 AS (
        SELECT DISTINCT
            *
        FROM iceberg.cdm.patient_diagnosis
        WHERE diagnosis_time >= TIMESTAMP '{d_start_time2_utc}'
          AND diagnosis_time < TIMESTAMP '{d_end_time2_utc}'
          AND ({dx_filter})
    )"""

    final_select = """
    select distinct
        b.patient_id,
        b.visit_id,
        b.patient_type,
        a.diagnosis_time as diagnosis_datetime,
        b.specialty,
        a.diagnosis_code as dx,
        a.diagnosis_name as dx_desc,
        a.diagnosis_code as dicd10_code,
        a.diagnosis_type_name as dx_source,
        a.is_primary as pdx,
        a.diagnosis_name as icd10_name,
        b.provider_id
    from t7 a
    join t6 b on a.visit_record_id = b.visit_id;
    """

    def assemble(visits_cte, visits_of_cohort_cte):
        """Join the CTE fragments into one complete statement."""
        ctes = [provider_and_diabetes_ctes, visits_cte, cohort_patient_ctes,
                visits_of_cohort_cte, diagnosis_rows]
        return "WITH" + ",".join(ctes) + final_select

    # Order matters: the writer labels the first query outpatient, the rest inpatient.
    queries = [
        assemble(outpatient_visits, outpatient_cohort_visits),
        assemble(inpatient_visits, inpatient_cohort_visits),
    ]
    execute_queries_and_write_to_csv(queries, output_path)
from data_query import *
def etl_prescribing(pv_id,d_start_time,d_end_time,d_start_time_utc,d_end_time_utc,d_start_time2_utc,d_end_time2_utc):
    """Extract antidiabetic drug orders for one provider's diabetes cohort into ./prescribing.csv.

    Two SQL statements are built (outpatient prescriptions from
    ``outpat_recipe_detail`` first, inpatient orders from ``inpat_drug_order``
    second) and passed to ``execute_queries_and_write_to_csv``.

    Args:
        pv_id: Value matched against ``visit_record.organization_id``.
        d_start_time, d_end_time: ``YYYY-MM-DD`` strings bounding the
            outpatient ``date_of_visiting`` (both ends inclusive).
        d_start_time_utc, d_end_time_utc: Timestamp literals bounding the
            inpatient ``discharge_time`` (start inclusive, end exclusive).
        d_start_time2_utc, d_end_time2_utc: Timestamp literals bounding the
            diagnosis time and the drug order time (start inclusive, end exclusive).
    """
    output_path = r'./prescribing.csv'

    # Kept as plain strings so regex metacharacters are never touched by
    # f-string interpolation; shared by both queries.
    drug_patterns = (
        '格列(本脲|吡嗪|喹酮|齐特|美脲|波脲)|甲苯磺丁脲|氯磺丙脲|优降糖|达安疗|美吡达|瑞易宁|秦苏|迪沙|依吡达|优哒灵|元坦|麦林格|唐贝克|曼迪宝|美吡达|糖适平|捷适|达美康|弗莱因|弘旭阳|谐尔平|亚莫利|万苏平|佑苏|力贻苹|迪北|安多美|科德平|伊瑞|佳和洛|普仁平|克糖利',
        '(瑞|那|米)格列奈|诺和龙|弗来迪|唐力|唐瑞|贝加|快如妥',
        '二甲双胍|甲福明|格华止|奈达|泰白|至力|倍顺|麦克罗辛|麦特美|唐必呋|亿恒|仁欣|悦达宁|力乐尔|卜可|迪化唐锭|美迪康|君士达新|唐落|山姆士|君力达',
        '(罗|吡)格列酮|文迪雅|奥洛华|爱能|太罗|维戈洛|宜力喜|圣敏|耐迪|安瑞宁|艾可拓|卡司平|顿灵|贝唐宁|佳普喜|安可妥|凯宝维元|艾汀|卡司平|瑞彤|列洛|夷友',
        '(阿卡|伏格列)波糖|米格列醇|拜唐苹|卡博平|贝希|倍欣|华怡平|德赛天|米格尼醇|Glyset|奥恬苹|瑞舒',
        '(西|维|沙|利|阿)格列汀',
        '(达|恩|坎|伊|托)格列净',
        '(西|沙|维|利)格列汀|恩格列净|欧唐静',
        '胰岛素|(重和|万苏)林|甘舒霖|优泌(林|乐)|NPH|艾倍得|糖德仕|来得时|优(思|乐)灵|(速|长)秀霖|诺和(锐|平|佳|达|灵)',
        '(艾塞那|利司那|贝那鲁|利拉鲁|聚乙二醇洛塞那|司美格鲁|度拉糖)肽|百泌达|百达扬|利时敏|谊生泰|诺和力|弗来美|诺和泰|度易达',
    )
    drug_filter = "\n            OR ".join(
        f"drug_name ~* '{pattern}'" for pattern in drug_patterns
    )

    # t1: every visit of the provider; t2: visits carrying a diabetes diagnosis.
    provider_and_diabetes_ctes = f"""
    t1 AS (SELECT DISTINCT
               pat_base_id AS patient_id,
               visit_record_id AS visit_id,
               organization_id AS provider_id
           FROM iceberg.cdm.visit_record
           WHERE organization_id = '{pv_id}'
    ),
    t2 AS (SELECT DISTINCT
               visit_record_id AS visit_id
           FROM iceberg.cdm.patient_diagnosis
           WHERE diagnosis_name LIKE '%糖尿病%'
             AND diagnosis_time >= TIMESTAMP '{d_start_time2_utc}'
             AND diagnosis_time < TIMESTAMP '{d_end_time2_utc}'
    )"""

    # t3 variants: the visits of interest inside the extraction window.
    outpatient_visits = f"""
    t3 AS (SELECT DISTINCT
               visit_record_id AS visit_id,
               date_of_visiting AS admission_datetime,
               NULL AS discharge_datetime,
               specialty_name AS specialty,
               CASE
                   WHEN is_emergency = TRUE THEN '急诊'
                   ELSE '门诊'
               END AS patient_type
           FROM iceberg.cdm.outpatient_record
           WHERE CAST(date_of_visiting AS DATE) >= DATE '{d_start_time}'
             AND CAST(date_of_visiting AS DATE) <= DATE '{d_end_time}')"""
    inpatient_visits = f"""
    t3 AS (SELECT DISTINCT
               visit_record_id AS visit_id,
               admission_time AS admission_datetime,
               discharge_time AS discharge_datetime,
               admission_specialty_name AS specialty,
               '住院' AS patient_type
           FROM iceberg.cdm.inpat_record
           WHERE discharge_time >= TIMESTAMP '{d_start_time_utc}'
             AND discharge_time < TIMESTAMP '{d_end_time_utc}'
    )"""

    # t4: patients with a diabetes visit; t5: all provider visits of those patients.
    cohort_patient_ctes = """
    t4 AS (SELECT DISTINCT a.patient_id
           FROM t1 a
           JOIN t2 b ON a.visit_id = b.visit_id),
    t5 AS (SELECT b.patient_id, b.visit_id, b.provider_id
           FROM t4 a
           JOIN t1 b ON a.patient_id = b.patient_id)"""

    def cohort_visits_cte(columns):
        """Return the t6 CTE (cohort visits joined to t3) with the given column list."""
        return f"""
    t6 AS (SELECT DISTINCT {columns}
           FROM t5 a
           JOIN t3 b ON a.visit_id = b.visit_id)"""

    def drug_rows_cte(source_table, time_column):
        """Return the t7 CTE selecting matching drug rows from ``source_table``."""
        return f"""
    t7 AS (
        SELECT *
        FROM {source_table}
        WHERE {time_column} >= TIMESTAMP '{d_start_time2_utc}'
          AND {time_column} < TIMESTAMP '{d_end_time2_utc}'
          AND (
            {drug_filter})
    )"""

    def final_select(order_datetime, rx_start, rx_end, days_supply):
        """Return the projection; only four source expressions differ between queries."""
        return f"""
    select distinct
        b.patient_id,
        b.visit_id,
        b.patient_type,
        b.specialty,
        a.drug_name as rx_desc,
        {order_datetime} as order_datetime,
        {rx_start} as rx_start_datetime,
        {rx_end} as rx_end_datetime,
        a.dose as dosage_qty,
        a.dose_unit_name as dosage_unit,
        a.frequency_code as frequency,
        a.qty as quantity,
        null as quantity_uom,
        a.route_name as roa,
        a.specs as drug_spec,
        {days_supply} as days_supply,
        b.provider_id
    from t7 a
    join t6 b on a.visit_record_id = b.visit_id;
    """

    outpatient_query = "WITH" + ",".join([
        provider_and_diabetes_ctes,
        outpatient_visits,
        cohort_patient_ctes,
        cohort_visits_cte(
            "a.patient_id, a.visit_id, b.specialty, b.patient_type, a.provider_id"),
        drug_rows_cte("iceberg.cdm.outpat_recipe_detail", "prescription_time"),
    ]) + final_select("a.prescription_time", "null", "null", "a.day_num")

    inpatient_query = "WITH" + ",".join([
        provider_and_diabetes_ctes,
        inpatient_visits,
        cohort_patient_ctes,
        cohort_visits_cte(
            "a.patient_id, a.visit_id, b.admission_datetime, b.discharge_datetime, "
            "b.specialty, b.patient_type, a.provider_id"),
        drug_rows_cte("iceberg.cdm.inpat_drug_order", "input_time"),
    ]) + final_select("a.input_time", "a.begin_time", "a.end_time", "null")

    # Order matters: the writer labels the first query outpatient, the rest inpatient.
    queries = [outpatient_query, inpatient_query]
    execute_queries_and_write_to_csv(queries, output_path)
from data_query import *
def etl_lab_result_cm(pv_id,d_start_time,d_end_time,d_start_time_utc,d_end_time_utc,d_start_time2_utc,d_end_time2_utc):
    """Extract selected lab results for one provider's diabetes cohort into ./lab_result_cm.csv.

    Two SQL statements are built (outpatient/emergency visits first, inpatient
    stays second) and passed to ``execute_queries_and_write_to_csv``. The two
    differ in the visit source (t3), the columns carried by t6 and the
    ``specimen_source`` output column.

    Args:
        pv_id: Value matched against ``visit_record.organization_id``.
        d_start_time, d_end_time: ``YYYY-MM-DD`` strings bounding the
            outpatient ``date_of_visiting`` (both ends inclusive).
        d_start_time_utc, d_end_time_utc: Timestamp literals bounding the
            inpatient ``discharge_time`` (start inclusive, end exclusive).
        d_start_time2_utc, d_end_time2_utc: Timestamp literals bounding the
            diagnosis time and the lab ``report_time`` (start inclusive, end exclusive).
    """
    output_path = r'./lab_result_cm.csv'

    # t1: every visit of the provider; t2: visits carrying a diabetes diagnosis.
    provider_and_diabetes_ctes = f"""
    t1 AS (SELECT DISTINCT
               pat_base_id AS patient_id,
               visit_record_id AS visit_id,
               organization_id AS provider_id
           FROM iceberg.cdm.visit_record
           WHERE organization_id = '{pv_id}'
    ),
    t2 AS (SELECT DISTINCT
               visit_record_id AS visit_id
           FROM iceberg.cdm.patient_diagnosis
           WHERE diagnosis_name LIKE '%糖尿病%'
             AND diagnosis_time >= TIMESTAMP '{d_start_time2_utc}'
             AND diagnosis_time < TIMESTAMP '{d_end_time2_utc}'
    )"""

    # t3 variants: the visits of interest inside the extraction window.
    outpatient_visits = f"""
    t3 AS (SELECT DISTINCT
               visit_record_id AS visit_id,
               date_of_visiting AS admission_datetime,
               NULL AS discharge_datetime,
               specialty_name AS specialty,
               CASE
                   WHEN is_emergency = TRUE THEN '急诊'
                   ELSE '门诊'
               END AS patient_type
           FROM iceberg.cdm.outpatient_record
           WHERE CAST(date_of_visiting AS DATE) >= DATE '{d_start_time}'
             AND CAST(date_of_visiting AS DATE) <= DATE '{d_end_time}')"""
    inpatient_visits = f"""
    t3 AS (SELECT DISTINCT
               visit_record_id AS visit_id,
               admission_time AS admission_datetime,
               discharge_time AS discharge_datetime,
               admission_specialty_name AS specialty,
               '住院' AS patient_type
           FROM iceberg.cdm.inpat_record
           WHERE discharge_time >= TIMESTAMP '{d_start_time_utc}'
             AND discharge_time < TIMESTAMP '{d_end_time_utc}'
    )"""

    # t4: patients with a diabetes visit; t5: all provider visits of those patients.
    cohort_patient_ctes = """
    t4 AS (SELECT DISTINCT a.patient_id
           FROM t1 a
           JOIN t2 b ON a.visit_id = b.visit_id),
    t5 AS (SELECT b.patient_id, b.visit_id, b.provider_id
           FROM t4 a
           JOIN t1 b ON a.patient_id = b.patient_id)"""

    def cohort_visits_cte(columns):
        """Return the t6 CTE (cohort visits joined to t3) with the given column list."""
        return f"""
    t6 AS (SELECT DISTINCT {columns}
           FROM t5 a
           JOIN t3 b ON a.visit_id = b.visit_id)"""

    # t7: lab result items selected by test item name. Plain string: no interpolation.
    lab_item_rows = """
    t7 AS (
        select DISTINCT * from iceberg.cdm.lab_report_result
        where (test_item_name ~* 'C肽|C-PR' and test_item_name ~* '空腹|1|60|2|120|3|180')
           or (test_item_name ~* '空腹|FPG|空腹血糖' and test_item_name ~* '血')
           or (test_item_name ~* 'OGTT|耐量|负荷' and test_item_name ~* '2|120')
    )"""

    def final_select(specimen_source):
        """Return the projection; only the specimen_source expression varies."""
        return f"""
    select DISTINCT
        b.patient_id,
        b.visit_id,
        b.patient_type,
        a.report_name as lab_name,
        c.test_item_name as lab_item_name,
        null as std_lab_item_name,
        {specimen_source} as specimen_source,
        null as lab_order_datetime,
        a.specimen_collected_time as specimen_datetime,
        a.report_time as result_datetime,
        c.numerical_value as result_num,
        c.unit_name as result_unit,
        c.reference_range as result_range,
        c.text_value as result_qual,
        c.critical_flag as abnormal_ind,
        b.provider_id
    from (select *
          from iceberg.cdm.lab_report
          where report_time >= TIMESTAMP '{d_start_time2_utc}'
            AND report_time < TIMESTAMP '{d_end_time2_utc}') a
    join t6 b
        on a.visit_record_id = b.visit_id
    join t7 c on c.result_source_id = a.report_source_id;
    """

    # The outpatient query emits NULL for specimen_source (the original had
    # a.specimen_name commented out there); the inpatient query uses a.specimen_name.
    outpatient_query = "WITH" + ",".join([
        provider_and_diabetes_ctes,
        outpatient_visits,
        cohort_patient_ctes,
        cohort_visits_cte(
            "a.patient_id, a.visit_id, b.specialty, b.patient_type, a.provider_id"),
        lab_item_rows,
    ]) + final_select("null")

    inpatient_query = "WITH" + ",".join([
        provider_and_diabetes_ctes,
        inpatient_visits,
        cohort_patient_ctes,
        cohort_visits_cte(
            "a.patient_id, a.visit_id, b.admission_datetime, b.discharge_datetime, "
            "b.specialty, b.patient_type, a.provider_id"),
        lab_item_rows,
    ]) + final_select("a.specimen_name")

    # Order matters: the writer labels the first query outpatient, the rest inpatient.
    queries = [outpatient_query, inpatient_query]
    execute_queries_and_write_to_csv(queries, output_path)
from data_query import *
def etl_patient(pv_id,d_start_time,d_end_time,d_start_time_utc,d_end_time_utc,d_start_time2_utc,d_end_time2_utc):
    """Extract demographic rows for one provider's diabetes cohort into ./patient.csv.

    Two SQL statements are built (patients seen as outpatient/emergency first,
    patients with inpatient stays second) and passed to
    ``execute_queries_and_write_to_csv``.

    Args:
        pv_id: Value matched against ``organization_id`` in ``visit_record``
            and ``patient_base_info``.
        d_start_time, d_end_time: ``YYYY-MM-DD`` strings bounding the
            outpatient ``date_of_visiting`` (both ends inclusive).
        d_start_time_utc, d_end_time_utc: Accepted for signature parity with
            the other ETL steps; not used by these queries.
        d_start_time2_utc, d_end_time2_utc: Timestamp literals bounding the
            outpatient-query diagnosis time and the inpatient ``discharge_time``
            (start inclusive, end exclusive).

    NOTE(review): unlike the other ETL steps, the inpatient query here does not
    restrict the diabetes diagnosis by time and filters discharge_time with the
    widened window. That behaviour is preserved as found; confirm it is intended.
    """
    output_path = r'./patient.csv'

    # t1: every visit of the provider.
    provider_visits = f"""
    t1 AS (SELECT DISTINCT
               pat_base_id AS patient_id,
               visit_record_id AS visit_id,
               organization_id AS provider_id
           FROM iceberg.cdm.visit_record
           WHERE organization_id = '{pv_id}'
    )"""

    # t2 variants: visits carrying a diabetes diagnosis (time-bounded or not).
    diabetes_visits_in_window = f"""
    t2 AS (SELECT DISTINCT
               visit_record_id AS visit_id
           FROM iceberg.cdm.patient_diagnosis
           WHERE diagnosis_name LIKE '%糖尿病%'
             AND diagnosis_time >= TIMESTAMP '{d_start_time2_utc}'
             AND diagnosis_time < TIMESTAMP '{d_end_time2_utc}')"""
    diabetes_visits_any_time = """
    t2 AS (SELECT DISTINCT
               visit_record_id AS visit_id
           FROM iceberg.cdm.patient_diagnosis
           WHERE diagnosis_name LIKE '%糖尿病%')"""

    # t3 variants: the visits of interest.
    outpatient_visits = f"""
    t3 AS (SELECT DISTINCT
               visit_record_id AS visit_id,
               date_of_visiting AS admission_datetime,
               NULL AS discharge_datetime,
               specialty_name AS specialty,
               CASE
                   WHEN is_emergency = TRUE THEN '急诊'
                   ELSE '门诊'
               END AS patient_type
           FROM iceberg.cdm.outpatient_record
           WHERE CAST(date_of_visiting AS DATE) >= DATE '{d_start_time}'
             AND CAST(date_of_visiting AS DATE) <= DATE '{d_end_time}')"""
    inpatient_visits = f"""
    t3 AS (SELECT DISTINCT
               visit_record_id AS visit_id,
               admission_time AS admission_datetime,
               discharge_time AS discharge_datetime,
               admission_specialty_name AS specialty,
               '住院' AS patient_type
           FROM iceberg.cdm.inpat_record
           WHERE discharge_time >= TIMESTAMP '{d_start_time2_utc}'
             AND discharge_time < TIMESTAMP '{d_end_time2_utc}'
    )"""

    # t4: patients with a diabetes visit; t5: all provider visits of those
    # patients; t6: those visits joined to the visits of interest.
    cohort_ctes = """
    t4 AS (SELECT DISTINCT a.patient_id
           FROM t1 a
           JOIN t2 b ON a.visit_id = b.visit_id),
    t5 AS (SELECT b.patient_id, b.visit_id, b.provider_id
           FROM t4 a
           JOIN t1 b ON a.patient_id = b.patient_id),
    t6 AS (SELECT DISTINCT a.patient_id,
               a.visit_id,
               b.admission_datetime,
               b.discharge_datetime,
               b.specialty,
               b.patient_type,
               a.provider_id
           FROM t5 a
           JOIN t3 b ON a.visit_id = b.visit_id)"""

    # Demographics joined from patient_base_info; sex is normalised to 男/女 or NULL.
    final_select = f"""
    SELECT DISTINCT
        a.patient_id,
        b.gender_name as raw_sex,
        CASE
            WHEN b.gender_name = '男性' THEN '男'
            WHEN b.gender_name = '女性' THEN '女'
            ELSE null END AS sex,
        b.date_of_birth as birth_date,
        a.provider_id
    FROM t6 a
    join (select * from iceberg.cdm.patient_base_info
          WHERE organization_id = '{pv_id}'
         ) b
        on a.provider_id = b.organization_id
        and a.patient_id = b.pat_base_id;
    """

    def assemble(diabetes_cte, visits_cte):
        """Join the CTE fragments and the final projection into one statement."""
        ctes = [provider_visits, diabetes_cte, visits_cte, cohort_ctes]
        return "WITH" + ",".join(ctes) + final_select

    # Order matters: the writer labels the first query outpatient, the rest inpatient.
    queries = [
        assemble(diabetes_visits_in_window, outpatient_visits),
        assemble(diabetes_visits_any_time, inpatient_visits),
    ]
    execute_queries_and_write_to_csv(queries, output_path)
from data_query import *
def etl_visit(pv_id,d_start_time,d_end_time,d_start_time_utc,d_end_time_utc,d_start_time2_utc,d_end_time2_utc):
    """Extract visit rows for one provider's diabetes cohort into ./visit.csv.

    Two SQL statements are built (outpatient/emergency visits first, inpatient
    stays second) and passed to ``execute_queries_and_write_to_csv``. They
    differ only in the t3 visit source.

    Args:
        pv_id: Value matched against ``visit_record.organization_id``.
        d_start_time, d_end_time: ``YYYY-MM-DD`` strings bounding the
            outpatient ``date_of_visiting`` (both ends inclusive).
        d_start_time_utc, d_end_time_utc: Timestamp literals bounding the
            inpatient ``discharge_time`` (start inclusive, end exclusive).
        d_start_time2_utc, d_end_time2_utc: Timestamp literals bounding
            ``diagnosis_time`` (start inclusive, end exclusive).
    """
    output_path = r'./visit.csv'

    # t1: every visit of the provider; t2: visits carrying a diabetes diagnosis.
    provider_and_diabetes_ctes = f"""
    t1 AS (SELECT DISTINCT
               pat_base_id AS patient_id,
               visit_record_id AS visit_id,
               organization_id AS provider_id
           FROM iceberg.cdm.visit_record
           WHERE organization_id = '{pv_id}'
    ),
    t2 AS (SELECT DISTINCT
               visit_record_id AS visit_id
           FROM iceberg.cdm.patient_diagnosis
           WHERE diagnosis_name LIKE '%糖尿病%'
             AND diagnosis_time >= TIMESTAMP '{d_start_time2_utc}'
             AND diagnosis_time < TIMESTAMP '{d_end_time2_utc}'
    )"""

    # t3 variants: the visits of interest inside the extraction window.
    outpatient_visits = f"""
    t3 AS (SELECT DISTINCT
               visit_record_id AS visit_id,
               date_of_visiting AS admission_datetime,
               NULL AS discharge_datetime,
               specialty_name AS specialty,
               CASE
                   WHEN is_emergency = TRUE THEN '急诊'
                   ELSE '门诊'
               END AS patient_type
           FROM iceberg.cdm.outpatient_record
           WHERE CAST(date_of_visiting AS DATE) >= DATE '{d_start_time}'
             AND CAST(date_of_visiting AS DATE) <= DATE '{d_end_time}')"""
    inpatient_visits = f"""
    t3 AS (SELECT DISTINCT
               visit_record_id AS visit_id,
               admission_time AS admission_datetime,
               discharge_time AS discharge_datetime,
               admission_specialty_name AS specialty,
               '住院' AS patient_type
           FROM iceberg.cdm.inpat_record
           WHERE discharge_time >= TIMESTAMP '{d_start_time_utc}'
             AND discharge_time < TIMESTAMP '{d_end_time_utc}'
    )"""

    # t4: patients with a diabetes visit; t5: all provider visits of those
    # patients; t6: those visits joined to the visits of interest.
    cohort_ctes = """
    t4 AS (SELECT DISTINCT a.patient_id
           FROM t1 a
           JOIN t2 b ON a.visit_id = b.visit_id),
    t5 AS (SELECT b.patient_id, b.visit_id, b.provider_id
           FROM t4 a
           JOIN t1 b ON a.patient_id = b.patient_id),
    t6 AS (SELECT DISTINCT a.patient_id,
               a.visit_id,
               b.admission_datetime,
               b.discharge_datetime,
               b.specialty,
               b.patient_type,
               a.provider_id
           FROM t5 a
           JOIN t3 b ON a.visit_id = b.visit_id)"""

    final_select = """
    select * from t6;
    """

    # Order matters: the writer labels the first query outpatient, the rest inpatient.
    queries = [
        "WITH" + ",".join([provider_and_diabetes_ctes, visits_cte, cohort_ctes]) + final_select
        for visits_cte in (outpatient_visits, inpatient_visits)
    ]
    execute_queries_and_write_to_csv(queries, output_path)
import logging
from datetime import datetime, timedelta
from flightsql import FlightSQLClient
import pandas as pd
import os
class BeijingTimeFormatter(logging.Formatter):
    """Logging formatter that renders record timestamps in Beijing time (UTC+8).

    The conversion is done from the record's epoch timestamp, so the output
    does not depend on the time zone configured on the host.
    """

    def formatTime(self, record, datefmt=None):
        """Format ``record.created`` as Beijing wall-clock time.

        Args:
            record: The ``logging.LogRecord`` being formatted.
            datefmt: Optional ``strftime`` pattern; defaults to
                ``'%Y-%m-%d %H:%M:%S'`` when omitted.

        Returns:
            The formatted timestamp string.
        """
        # Function-scope import: the module header only imports datetime/timedelta.
        from datetime import timezone

        # Convert the epoch value directly to UTC+8. Adding eight hours to
        # datetime.fromtimestamp() (host-local time) is only right on a UTC host.
        beijing = timezone(timedelta(hours=8))
        bj_time = datetime.fromtimestamp(record.created, tz=beijing)
        return bj_time.strftime(datefmt or '%Y-%m-%d %H:%M:%S')
# Shared formatter that stamps every log line with Beijing time.
formatter = BeijingTimeFormatter('%(asctime)s - %(levelname)s - %(message)s')

# One handler for the run log file, one for the console; both use the formatter.
file_handler = logging.FileHandler('etl_run.log')
stream_handler = logging.StreamHandler()
for _handler in (file_handler, stream_handler):
    _handler.setFormatter(formatter)

# Install both handlers on the root logger at INFO level.
logging.basicConfig(
    handlers=[file_handler, stream_handler],
    level=logging.INFO,
)
def execute_query(sql_query):
    """Run one SQL statement over Flight SQL and return the rows as a DataFrame.

    Args:
        sql_query: The SQL text to execute.

    Returns:
        A pandas DataFrame concatenating the data of every endpoint, or
        ``None`` when anything fails (the error is logged).
    """
    client = None
    try:
        client = FlightSQLClient(host='192.168.101.45', port=50802,
                                 insecure=True, disable_server_verification=True, token=True)
        info = client.execute(sql_query)
        # One DataFrame per endpoint, concatenated into a single result.
        data_frames = [
            client.do_get(endpoint.ticket).read_all().to_pandas()
            for endpoint in info.endpoints
        ]
        return pd.concat(data_frames, ignore_index=True)
    except Exception as e:
        logging.error(f"发生错误: {e}")
        return None
    finally:
        # The client was previously never closed, leaking one connection per query.
        if client is not None:
            try:
                client.close()
            except Exception as close_error:
                # A failed close must not replace the query result or error.
                logging.warning(f"关闭连接失败: {close_error}")
def execute_queries_and_write_to_csv(queries, output_path):
    """Run each query in turn and append its rows to one CSV file.

    The first query is labelled outpatient in the log messages and every later
    one inpatient. Queries that fail or return no rows are logged and skipped.
    The header row is written only when the output file does not exist yet.

    :param queries: list of SQL statements
    :param output_path: path of the CSV file to create or append to
    """
    for position, sql in enumerate(queries):
        frame = execute_query(sql)
        label = "住院" if position else "门诊"

        if frame is None or frame.empty:
            logging.info(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - {output_path} {label} 查询无结果,跳过写入。")
            continue

        row_count = len(frame)
        logging.info(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - {output_path} {label} 查询行数: {row_count}")

        # A new file gets a header and is opened for writing; an existing one is appended to.
        is_new_file = not os.path.exists(output_path)
        frame.to_csv(
            output_path,
            index=False,
            encoding='utf-8-sig',
            mode='w' if is_new_file else 'a',
            header=is_new_file,
        )

        logging.info(f"{datetime.now().strftime('%Y-%m-%d %H:%M:%S')} - 成功将{label}查询结果写入{output_path}, 行数: {row_count}")
# Example usage: dump ten outpatient rows into ./patient.csv when run as a script.
if __name__ == "__main__":
    execute_queries_and_write_to_csv(
        ["SELECT * FROM iceberg.cdm.outpatient_record LIMIT 10;"],
        "./patient.csv",
    )
import adbc_driver_manager
import adbc_driver_flightsql.dbapi as flight_sql
import pandas as pd
def execute_query(sql_query):
    """Run one SQL statement through the ADBC Flight SQL driver.

    :param sql_query: str, the SQL text to execute
    :return: pandas DataFrame with the result, or None when anything fails
        (the error is printed)
    """
    try:
        # Context managers close the cursor and the connection on every path,
        # replacing the former "'cursor' in locals()" bookkeeping.
        with flight_sql.connect(uri="grpc://192.168.101.45:50802") as conn:
            with conn.cursor() as cursor:
                cursor.execute(sql_query)
                # Fetch the whole result as an Arrow table.
                result_arrow = cursor.fetchallarrow()
                print("查询结果的行数:", result_arrow.num_rows)
                return result_arrow.to_pandas()
    except Exception as e:
        print(f"发生错误: {e}")
        return None
# Example usage: fetch ten outpatient rows and print them when run as a script.
if __name__ == "__main__":
    query = "SELECT * FROM iceberg.cdm.outpatient_record LIMIT 10;"
    df = execute_query(query)
    if df is None:
        print("Query failed to execute.")
    else:
        print(df)
import pandas as pd
import os
def deduplicate_csv_files():
    """Drop duplicate rows from every ``.csv`` file in the current directory, in place."""
    # Snapshot the file list first; each file is then rewritten over itself.
    targets = list(filter(lambda name: name.endswith('.csv'), os.listdir('.')))
    for path in targets:
        original = pd.read_csv(path, low_memory=False)
        unique_rows = original.drop_duplicates()
        print(f"原始数据行数({path}): {len(original)}。去重后数据行数({path}): {len(unique_rows)}")
        # Overwrite the source file with the de-duplicated rows.
        unique_rows.to_csv(path, encoding='utf-8-sig', index=False)
        print(f"去重后的数据已覆盖原文件:{path}")
# Run the de-duplication pass whenever this module is executed.
deduplicate_csv_files()
import duckdb
import pyarrow as pa
import pyarrow.flight as fl
import logging
import os
# Configure the root logger's level and message format so server activity is recorded.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class DuckDBFlightServer(fl.FlightServerBase):
    """Arrow Flight server that serves queries from, and accepts data into, a DuckDB file.

    ``do_get`` treats the ticket bytes as a UTF-8 SQL statement and streams the
    result; ``do_put`` appends the uploaded record batches to the table named
    by the first element of the descriptor path.

    Failures are reported to the client by raising exceptions, which is how
    pyarrow.flight propagates errors from server handlers.

    NOTE(review): ``do_get`` executes whatever SQL the client sends; that looks
    deliberate for this server but should only be exposed to trusted clients.
    """

    def __init__(self, db_path):
        """Bind to port 8815 on all interfaces and open the DuckDB database.

        Args:
            db_path: Path of the DuckDB database file.

        Raises:
            duckdb.Error: If the database cannot be opened.
        """
        super().__init__(location="grpc://0.0.0.0:8815")
        try:
            self.connection = duckdb.connect(db_path)
            logging.info(f"Successfully connected to DuckDB database at {db_path}")
        except duckdb.Error as e:
            logging.error(f"Failed to connect to DuckDB database: {e}")
            raise  # do not start a server without a database

    def do_get(self, context, ticket):
        """Execute the SQL carried in ``ticket`` and stream the result.

        Raises:
            pa.ArrowInvalid: If the ticket is not valid UTF-8.
            fl.FlightInternalError: If DuckDB rejects or fails the query.
        """
        try:
            query = ticket.ticket.decode('utf-8')
        except UnicodeDecodeError as e:
            logging.error(f"Error decoding ticket in do_get: {e}")
            raise pa.ArrowInvalid("Invalid ticket encoding") from e

        try:
            table = self.connection.execute(query).fetchdf()
        except duckdb.Error as e:
            logging.error(f"Error executing query in do_get: {e}")
            raise fl.FlightInternalError("Database query error") from e

        # Convert the DataFrame to an Arrow table and stream it to the client.
        arrow_table = pa.Table.from_pandas(table)
        return fl.RecordBatchStream(arrow_table)

    def do_put(self, context, descriptor, reader, writer=None):
        """Append the uploaded batches to the table named in ``descriptor.path[0]``.

        ``writer`` is accepted because the Flight server passes a metadata
        writer to this handler as a fourth argument; it is not used here.

        Raises:
            pa.ArrowInvalid: If the table name is missing or not valid UTF-8.
            fl.FlightInternalError: If the insert fails in DuckDB.
        """
        if not descriptor.path:
            logging.error("Missing table name in descriptor path")
            raise pa.ArrowInvalid("Missing table name")
        try:
            table_name = descriptor.path[0].decode('utf-8')
        except UnicodeDecodeError as e:
            logging.error(f"Error decoding table name: {e}")
            raise pa.ArrowInvalid("Invalid table name encoding") from e

        # Quote each dotted part so a client-supplied name cannot inject SQL.
        quoted_name = ".".join(
            '"' + part.replace('"', '""') + '"' for part in table_name.split(".")
        )

        arrow_table = reader.read_all()
        # The local name ``df`` is referenced by name inside the SQL text below.
        df = arrow_table.to_pandas()
        try:
            self.connection.execute(f"INSERT INTO {quoted_name} SELECT * FROM df")
        except duckdb.Error as e:
            logging.error(f"Error executing put operation: {e}")
            raise fl.FlightInternalError("Error during data insertion") from e
        logging.info(f"Successfully inserted data into table {table_name}")
def main():
    """Start the Flight server on the ``exported.duckdb`` file next to this script."""
    script_dir = os.path.dirname(__file__)
    server = DuckDBFlightServer(os.path.join(script_dir, 'exported.duckdb'))
    print("Starting Flight Server on port 8815...")
    try:
        server.serve()
    except Exception as e:
        # Failures while serving are logged rather than propagated.
        logging.error(f"Error starting the Flight Server: {e}")


if __name__ == "__main__":
    main()
\ No newline at end of file
from flightsql import FlightSQLClient
import socket
import pandas as pd # 确保导入 pandas
from flightsql import __version__ as flightsql_version
# Print the version string exported by the flightsql package.
print(f"flightsql 版本:{flightsql_version}")
def test_connection(host, port):
try:
with socket.create_connection((host, port), timeout=5):
print(f"成功连接到 {host}:{port}")
except OSError as e:
print(f"无法连接到 {host}:{port},错误: {e}")
# Build the Flight SQL client first, then check raw TCP reachability of the same endpoint.
client = FlightSQLClient(host='192.168.101.45', port=50802, insecure=True, disable_server_verification=True, token=True)
test_connection('192.168.101.45', 50802)

# Submit the query; the returned info lists the endpoints holding the result.
info = client.execute("SELECT * FROM iceberg.cdm.outpatient_record LIMIT 10000000")

# Read every endpoint into its own DataFrame.
data_frames = [
    client.do_get(endpoint.ticket).read_all().to_pandas()
    for endpoint in info.endpoints
]

# Combine the per-endpoint frames and show the result.
final_data_frame = pd.concat(data_frames, ignore_index=True)
print(final_data_frame)
import pandas as pd
import os
def deduplicate_csv_files(directory='.'):
    """Write a de-duplicated copy of every CSV file found in *directory*.

    For each ``<name>.csv`` the fully identical rows are dropped and the
    result is saved next to it as ``<name>_unique.csv`` (index not written).
    Files that already end in ``_unique.csv`` are skipped, so running the
    function a second time does not create ``<name>_unique_unique.csv``.

    Args:
        directory: Folder to scan; defaults to the current working directory.
    """
    output_suffix = '_unique.csv'
    csv_files = [
        f for f in os.listdir(directory)
        if f.endswith('.csv') and not f.endswith(output_suffix)
    ]
    for file in csv_files:
        df = pd.read_csv(os.path.join(directory, file))
        df_deduplicated = df.drop_duplicates()
        print(f"原始数据行数({file}): {len(df)}。去重后数据行数({file}): {len(df_deduplicated)}")
        # Messages keep showing the bare file name, as before
        new_filename = f"{os.path.splitext(file)[0]}_unique.csv"
        df_deduplicated.to_csv(os.path.join(directory, new_filename), index=False)
        print(f"去重后的数据已保存到:{new_filename}")
# 调用函数
deduplicate_csv_files()
from data_query import *
# Active query: distinct (numerical_value, normal_low, normal_high) combinations
# with their row counts for lab results whose test_item_name is 'BP'.
queries = """
select DISTINCT numerical_value,normal_low,normal_high ,count(*) from iceberg.cdm.lab_report_result
where test_item_name = 'BP' group by test_item_name,numerical_value,normal_low,normal_high
"""
# The bare string below is not assigned to anything; it holds an alternative
# query (item names matching blood-pressure terms) that is currently not run.
"""
queries =
select DISTINCT test_item_name,count(*) from iceberg.cdm.lab_report_result
where test_item_name ~* '血压|舒张压|收缩压|BP|SBP|DBP' group by test_item_name
"""
# Run the query; execute_query is expected to come from the star import of
# data_query, and its return value is printed as-is.
a = execute_query(queries)
print(a)
\ No newline at end of file
-- Data exploration: row counts of the source tables
-- (the trailing numbers are presumably the counts seen when this was written)
select count(*) from "DIABETES_DIAGNOSIS"; -- 220879
select count(*) from "DIABETES_FEE_DETAIL"; -- 5483603
select count(*) from "DIABETES_LAB_RESULT_CM"; -- 1530680
select count(*) from "DIABETES_PATIENT"; -- 5377
select count(*) from "DIABETES_PRESCRIBING"; -- 1113684
select count(*) from "DIABETES_VISIT"; -- 96176
--
-- Create the in2_* working tables as full copies of the source tables
create table in2_diagnosis as (
select * from "DIABETES_DIAGNOSIS" );
create table in2_patient as (
select * from "DIABETES_PATIENT" );
create table in2_visit as (
select * from "DIABETES_VISIT" );
create table in2_prescribing as (
select * from "DIABETES_PRESCRIBING" );
create table in2_lab_result_cm as (
select * from "DIABETES_LAB_RESULT_CM" );
-- ******************************************************************* The data has to be brought into the same format as the type 2 diabetes data set
-- patient
-- Patients that also appear in the diagnosis table
select distinct a.patient_id from in2_patient a join in2_diagnosis b on a.patient_id = b.patient_id;
-- Preview of the id with 'DT_' removed
SELECT patient_id,REPLACE(patient_id, 'DT_', '') AS patient_id_cleaned
FROM in2_patient;
-- Inspect the distribution of sex values
select sex,count(*) from in2_patient group by sex;
-- Add a column for the standardized sex
ALTER TABLE in2_patient ADD COLUMN std_sex VARCHAR(10);
-- Fill it from the raw value
update in2_patient set std_sex = sex;
-- Rewrite the id without 'DT_'
update in2_patient set patient_id = REPLACE(patient_id, 'DT_', '');
select std_sex,count(*) from in2_patient group by std_sex;
-- Convert the date format: preview, then update
SELECT birth_date,TO_TIMESTAMP(REPLACE(birth_date, '/', '-'), 'YYYY-MM-DD') from
in2_patient;
update in2_patient set birth_date = TO_TIMESTAMP(REPLACE(birth_date, '/', '-'), 'YYYY-MM-DD');
-- Drop the time zone part: preview, then update
SELECT birth_date,(birth_date::timestamptz)::timestamp from in2_patient;
update in2_patient set birth_date =(birth_date::timestamptz)::timestamp;
-- visit table
select * from in2_visit;
-- Previews of the datetime conversion applied by the updates below
SELECT admission_datetime,
TO_TIMESTAMP(REPLACE(admission_datetime, '/', ' '), 'YYYY-MM-DD HH24:MI:SS') AT TIME ZONE 'Asia/Singapore' AS converted_datetime
FROM in2_visit;
SELECT discharge_datetime,
TO_TIMESTAMP(REPLACE(discharge_datetime, '/', ' '), 'YYYY-MM-DD HH24:MI:SS') AT TIME ZONE 'Asia/Singapore' AS converted_datetime
FROM in2_visit;
-- Overwrite both columns with the converted value
update in2_visit set admission_datetime = TO_TIMESTAMP(REPLACE(admission_datetime, '/', ' '), 'YYYY-MM-DD HH24:MI:SS') AT TIME ZONE 'Asia/Singapore';
update in2_visit set discharge_datetime = TO_TIMESTAMP(REPLACE(discharge_datetime, '/', ' '), 'YYYY-MM-DD HH24:MI:SS') AT TIME ZONE 'Asia/Singapore';
-- diagnosis table
select * from in2_diagnosis;
-- Preview of the datetime conversion applied by the update below
SELECT diagnosis_datetime,
TO_TIMESTAMP(REPLACE(diagnosis_datetime, '/', ' '), 'YYYY-MM-DD HH24:MI:SS') AT TIME ZONE 'Asia/Singapore' AS converted_datetime
FROM in2_diagnosis;
update in2_diagnosis set diagnosis_datetime = TO_TIMESTAMP(REPLACE(diagnosis_datetime, '/', ' '), 'YYYY-MM-DD HH24:MI:SS') AT TIME ZONE 'Asia/Singapore';
-- update in2_diagnosis set dx_desc = raw_dx_desc;
-- Clear raw_dx_desc; the column is renamed to std_dx_desc right below
update in2_diagnosis set raw_dx_desc = null;
-- Rename the column
ALTER TABLE in2_diagnosis
RENAME COLUMN raw_dx_desc TO std_dx_desc;
-- Add icd10_code and icd10_name and fill them from dx / dx_desc
-- NOTE(review): both columns are VARCHAR(30); confirm no dx or dx_desc value is longer than 30 characters
ALTER TABLE in2_diagnosis ADD COLUMN icd10_code VARCHAR(30);
ALTER TABLE in2_diagnosis ADD COLUMN icd10_name VARCHAR(30);
update in2_diagnosis set icd10_code = dx;
update in2_diagnosis set icd10_name = dx_desc;
-- 诊断标化
--数据探索
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* ''
group by dx_desc order by count(*) desc;
-- 1 背景性视网膜病变
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '白内障|视网膜|眼|黄斑|玻璃体|玻血|网脱|失明|弱视|视力'
and dx_desc ~* '背景性'
group by dx_desc order by count(*) desc; -- 3
-- 2 视网膜病变
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '白内障|视网膜|眼|黄斑|玻璃体|玻血|网脱|失明|弱视|视力'
and dx_desc ~* '增殖性'
group by dx_desc order by count(*) desc; -- 8
-- 3 黄斑水肿
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '黄斑水肿'
group by dx_desc order by count(*) desc; -- 1
-- 4 重度视觉丧失
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '失明|眼球?萎缩|眼球?缺失|盲目(3|三)|(视力|视觉)重度'
group by dx_desc order by count(*) desc; -- 2
-- 5 症状性神经病变
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '神经'
and dx_desc ~* '神经炎|神经痛|晕|麻|乏|幻|蚁|虫|触电|肌|腕管|植物神经|神经血管|直立性低血压|功能性腹泻|夏科'
group by dx_desc order by count(*) desc; -- 34
-- 6 外周血管病
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '(颈|髂总|髂内|肾脏|肢端|腹主|肢|肾小)主?动脉(粥样硬|痉挛|坏疽|硬|瘤|炎|栓塞|血栓)化?|间歇性?跛行|红斑性肢痛|(伯|柏)格|雷诺氏|周围血管疾?病|动脉(肌纤维发育异|坏疽|痉挛)|主动脉(瘤|炎)|主动脉(粥样)?硬化|(静脉)?曲张|血栓性静脉|下肢(深静脉血栓|静脉曲张|动脉闭塞|血栓性静脉炎|静脉功能不全|静脉炎|(血管|动脉)闭塞症|静脉肌间血栓形成)|周围循环'
and not dx_desc ~* '精索静脉曲张|颈动脉硬化'
group by dx_desc order by count(*) desc; -- 41
-- 7 下肢截肢
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '截肢|切断|截断|截'
and dx_desc ~* '腿|下肢|足'
group by dx_desc order by count(*) desc; -- 2
-- 8 微量白蛋白尿
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '微.{0,3}蛋白尿|蛋白尿.{0,3}微'
group by dx_desc order by count(*) desc; -- 1
-- 9 大量白蛋白尿
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '大.{0,3}蛋白尿|蛋白尿.{0,3}大'
group by dx_desc order by count(*) desc; -- 1
-- 10 终末期肾病
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '肾?移植|透析?|尿毒症|CKD(5|Ⅴ|五)|肾.{0,4}终末|终末.{0,4}肾'
group by dx_desc order by count(*) desc; -- 13
-- 11 缺血性心脏病
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '冠(心病|状|脉)|旁路移植|搭桥|多支|PCI|心绞痛|动脉硬化.{0,3}心脏病|心(肌|脏)(缺|供)血|缺血性心(脏|肌)病|心肌?梗'
and dx_desc ~* '心梗|心肌梗死|心痛|陈旧(性|型)?(心|ST|非ST|Q|前|侧|下|高|间|广泛|(左|右)心室)|心肌梗塞|胸痹'
group by dx_desc order by count(*) desc; -- 17
-- 12 心肌梗死
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '心梗|心肌梗死|心痛|陈旧(性|型)?(心|ST|非ST|Q|前|侧|下|高|间|广泛|(左|右)心室)|心肌梗塞|胸痹'
and not dx_desc ~* '陈旧(性|型)?|恢复期|个人史'
group by dx_desc order by count(*) desc; -- 12
-- 13 卒中
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '脑.{0,3}(梗|塞|死)|卒中|中风'
and not dx_desc ~* '腔隙性脑.{0,3}(梗|塞|死)|脑.{0,3}(梗|塞|死|卒中)(后遗症|个人史|恢复期)'
group by dx_desc order by count(*) desc; -- 17
-- 14 心力衰竭
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '心(力|室|房)?衰(竭)?|心功能不全|心功能.*级|心源性哮喘|低心排综合征|KILLIP.*级'
and not dx_desc ~* '肾(功能)?衰'
group by dx_desc order by count(*) desc; -- 22
-- 15 高血压
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* 'HBP|高血压'
and not dx_desc ~* '假性高血压'
group by dx_desc order by count(*) desc; -- 45
-- 16 血脂异常
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '血脂异常|(胆固醇|高脂|甘油三?(脂|酯))血症|高血脂|高粘血症|高(密度)?(酯|脂)蛋白|低(密度)?(酯|脂)蛋白|高?三酰甘油'
group by dx_desc order by count(*) desc; -- 6
-- 17 妊娠糖尿病
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '妊娠期.{0,3}糖尿病'
group by dx_desc order by count(*) desc; -- 6
-- 18 1型糖尿病
select dx_desc,count(*) from in2_diagnosis
where dx_desc ~* '(Ⅰ|I|1|一|胰岛素依赖).?糖尿病|糖尿病.?(Ⅰ|I|1|一|胰岛素依赖)型'
group by dx_desc order by count(*) desc; -- 41
-- Standardization
-- Reset std_dx_desc; the labelling statements that follow append to it
update in2_diagnosis set
std_dx_desc = null
;
-- Template for one labelling statement. It is kept as a comment because the
-- original had an empty WHERE clause and could not be executed.
-- update in2_diagnosis set
-- std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), 'LABEL,')
-- WHERE dx_desc ~* 'PATTERN'
-- ;
-- 1 背景性视网膜病变
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '背景性视网膜病变,')
WHERE dx_desc ~* '白内障|视网膜|眼|黄斑|玻璃体|玻血|网脱|失明|弱视|视力'
and dx_desc ~* '背景性';
-- 2 增殖性视网膜病变
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '增殖性视网膜病变,')
WHERE dx_desc ~* '白内障|视网膜|眼|黄斑|玻璃体|玻血|网脱|失明|弱视|视力'
and dx_desc ~* '增殖性';
-- 3 黄斑水肿
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '黄斑水肿,')
WHERE dx_desc ~* '黄斑水肿' ;
-- 4 重度视觉丧失
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '重度视觉丧失,')
WHERE dx_desc ~* '失明|眼球?萎缩|眼球?缺失|盲目(3|三)|(视力|视觉)重度' ;
-- 5 症状性神经病变
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '症状性神经病变,')
WHERE dx_desc ~* '神经'
and dx_desc ~* '神经炎|神经痛|晕|麻|乏|幻|蚁|虫|触电|肌|腕管|植物神经|神经血管|直立性低血压|功能性腹泻|夏科';
-- 6 外周血管病
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '外周血管病,')
WHERE dx_desc ~* '(颈|髂总|髂内|肾脏|肢端|腹主|肢|肾小)主?动脉(粥样硬|痉挛|坏疽|硬|瘤|炎|栓塞|血栓)化?|间歇性?跛行|红斑性肢痛|(伯|柏)格|雷诺氏|周围血管疾?病|动脉(肌纤维发育异|坏疽|痉挛)|主动脉(瘤|炎)|主动脉(粥样)?硬化|(静脉)?曲张|血栓性静脉|下肢(深静脉血栓|静脉曲张|动脉闭塞|血栓性静脉炎|静脉功能不全|静脉炎|(血管|动脉)闭塞症|静脉肌间血栓形成)|周围循环'
and not dx_desc ~* '精索静脉曲张|颈动脉硬化';
-- 7 下肢截肢
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '下肢截肢,')
WHERE dx_desc ~* '截肢|切断|截断|截'
and dx_desc ~* '腿|下肢|足';
-- 8 微量白蛋白尿
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '微量白蛋白尿,')
WHERE dx_desc ~* '微.{0,3}蛋白尿|蛋白尿.{0,3}微' ;
-- 9 大量白蛋白尿
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '大量白蛋白尿,')
WHERE dx_desc ~* '大.{0,3}蛋白尿|蛋白尿.{0,3}大' ;
-- 10 终末期肾病
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '终末期肾病,')
WHERE dx_desc ~* '肾?移植|透析?|尿毒症|CKD(5|Ⅴ|五)|肾.{0,4}终末|终末.{0,4}肾' ;
-- 11 缺血性心脏病
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '缺血性心脏病,')
WHERE dx_desc ~* '冠(心病|状|脉)|旁路移植|搭桥|多支|PCI|心绞痛|动脉硬化.{0,3}心脏病|心(肌|脏)(缺|供)血|缺血性心(脏|肌)病|心肌?梗'
and dx_desc ~* '心梗|心肌梗死|心痛|陈旧(性|型)?(心|ST|非ST|Q|前|侧|下|高|间|广泛|(左|右)心室)|心肌梗塞|胸痹';
-- 12 心肌梗死
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '心肌梗死,')
WHERE dx_desc ~* '心梗|心肌梗死|心痛|陈旧(性|型)?(心|ST|非ST|Q|前|侧|下|高|间|广泛|(左|右)心室)|心肌梗塞|胸痹'
and not dx_desc ~* '陈旧(性|型)?|恢复期|个人史';
-- 13 卒中
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '卒中,')
WHERE dx_desc ~* '脑.{0,3}(梗|塞|死)|卒中|中风'
and not dx_desc ~* '腔隙性脑.{0,3}(梗|塞|死)|脑.{0,3}(梗|塞|死|卒中)(后遗症|个人史|恢复期)';
-- 14 心力衰竭
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '心力衰竭,')
WHERE dx_desc ~* '心(力|室|房)?衰(竭)?|心功能不全|心功能.*级|心源性哮喘|低心排综合征|KILLIP.*级'
and not dx_desc ~* '肾(功能)?衰';
-- 15 高血压
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '高血压,')
WHERE dx_desc ~* 'HBP|高血压'
and not dx_desc ~* '假性高血压';
-- 16 血脂异常
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '血脂异常,')
WHERE dx_desc ~* '血脂异常|(胆固醇|高脂|甘油三?(脂|酯))血症|高血脂|高粘血症|高(密度)?(酯|脂)蛋白|低(密度)?(酯|脂)蛋白|高?三酰甘油' ;
-- 17 妊娠糖尿病
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '妊娠糖尿病,')
WHERE dx_desc ~* '妊娠期.{0,3}糖尿病' ;
-- 18 1型糖尿病
update in2_diagnosis set
std_dx_desc = CONCAT_WS('', COALESCE(std_dx_desc, ''), '1型糖尿病,')
WHERE dx_desc ~* '(Ⅰ|I|1|一|胰岛素依赖).?糖尿病|糖尿病.?(Ⅰ|I|1|一|胰岛素依赖)型';
-- 删除最后的逗号
UPDATE in2_diagnosis
SET std_dx_desc = TRIM(TRAILING ',' FROM std_dx_desc);
select std_dx_desc,count(*) from in2_diagnosis group by std_dx_desc;
-- Prescriptions
select * from in2_prescribing;
-- Datetime format conversion: three previews, then the three updates
SELECT order_datetime,
TO_TIMESTAMP(REPLACE(order_datetime, '/', ' '), 'YYYY-MM-DD HH24:MI:SS') AT TIME ZONE 'Asia/Singapore' AS converted_datetime
FROM in2_prescribing;
SELECT rx_start_datetime,
TO_TIMESTAMP(REPLACE(rx_start_datetime, '/', ' '), 'YYYY-MM-DD HH24:MI:SS') AT TIME ZONE 'Asia/Singapore' AS converted_datetime
FROM in2_prescribing;
SELECT rx_end_datetime,
TO_TIMESTAMP(REPLACE(rx_end_datetime, '/', ' '), 'YYYY-MM-DD HH24:MI:SS') AT TIME ZONE 'Asia/Singapore' AS converted_datetime
FROM in2_prescribing;
update in2_prescribing set order_datetime = TO_TIMESTAMP(REPLACE(order_datetime, '/', ' '), 'YYYY-MM-DD HH24:MI:SS') AT TIME ZONE 'Asia/Singapore';
update in2_prescribing set rx_start_datetime = TO_TIMESTAMP(REPLACE(rx_start_datetime, '/', ' '), 'YYYY-MM-DD HH24:MI:SS') AT TIME ZONE 'Asia/Singapore';
update in2_prescribing set rx_end_datetime = TO_TIMESTAMP(REPLACE(rx_end_datetime, '/', ' '), 'YYYY-MM-DD HH24:MI:SS') AT TIME ZONE 'Asia/Singapore';
-- Add a column for the number of days the drug order ran
ALTER TABLE in2_prescribing ADD COLUMN days_supply VARCHAR(10);
-- Fill it with the DAY field of (rx_end_datetime - rx_start_datetime)
update in2_prescribing set days_supply = EXTRACT(DAY FROM (to_timestamp(rx_end_datetime, 'YYYY-MM-DD HH24:MI:SS') - to_timestamp(rx_start_datetime, 'YYYY-MM-DD HH24:MI:SS')));
-- Show the same day difference next to its inputs
SELECT rx_end_datetime,rx_start_datetime,
EXTRACT(DAY FROM (to_timestamp(rx_end_datetime, 'YYYY-MM-DD HH24:MI:SS') - to_timestamp(rx_start_datetime, 'YYYY-MM-DD HH24:MI:SS'))) AS days_difference
FROM
in2_prescribing;
-- Add the standardization column
ALTER TABLE in2_prescribing ADD COLUMN std_rx_desc VARCHAR(30);
select rx_desc,count(*) from in2_prescribing
where rx_desc ~* ''
group by rx_desc order by count(*) desc;
-- 1 磺脲类
select rx_desc,std_rx_desc,count(*) from in2_prescribing
where rx_desc ~* '格列(本脲|吡嗪|喹酮|齐特|美脲|波脲)|甲苯磺丁脲|氯磺丙脲|优降糖|达安疗|美吡达|瑞易宁|秦苏|迪沙|依吡达|优哒灵|元坦|麦林格|唐贝克|曼迪宝|美吡达|糖适平|捷适|达美康|弗莱因|弘旭阳|谐尔平|亚莫利|万苏平|佑苏|力贻苹|迪北|安多美|科德平|伊瑞|佳和洛|普仁平|克糖利'
group by rx_desc,std_rx_desc order by count(*) desc; -- 26
-- 2 格列奈类
select rx_desc,std_rx_desc,count(*) from in2_prescribing
where rx_desc ~* '(瑞|那|米)格列奈|诺和龙|弗来迪|唐力|唐瑞|贝加|快如妥'
group by rx_desc,std_rx_desc order by count(*) desc; -- 8
-- 3 双胍类
select rx_desc,std_rx_desc,count(*) from in2_prescribing
where rx_desc ~* '二甲双胍|甲福明|格华止|奈达|泰白|至力|倍顺|麦克罗辛|麦特美|唐必呋|亿恒|仁欣|悦达宁|力乐尔|卜可|迪化唐锭|美迪康|君士达新|唐落|山姆士|君力达'
and not rx_desc ~* '吡嗪|本脲|注射|列汀|列净'
group by rx_desc,std_rx_desc order by count(*) desc; -- 22
-- 4 噻唑烷二酮类
select rx_desc,std_rx_desc,count(*) from in2_prescribing
where rx_desc ~* '(罗|吡)格列酮|文迪雅|奥洛华|爱能|太罗|维戈洛|宜力喜|圣敏|耐迪|安瑞宁|艾可拓|卡司平|顿灵|贝唐宁|佳普喜|安可妥|凯宝维元|艾汀|卡司平|瑞彤|列洛|夷友'
and not rx_desc ~* '双胍'
group by rx_desc,std_rx_desc order by count(*) desc; -- 8
-- 5 α-糖苷酶抑制剂类
select rx_desc,std_rx_desc,count(*) from in2_prescribing
where rx_desc ~* '(阿卡|伏格列)波糖|米格列醇|拜唐苹|卡博平|贝希|倍欣|华怡平|德赛天|米格尼醇|Glyset|奥恬苹|瑞舒'
and not rx_desc ~* '双胍'
group by rx_desc,std_rx_desc order by count(*) desc; -- 23
-- 6 DPP4i
select rx_desc,std_rx_desc,count(*) from in2_prescribing
where rx_desc ~* '(西|维|沙|利|阿)格列汀'
and not rx_desc ~* '双胍'
group by rx_desc,std_rx_desc order by count(*) desc; -- 6
-- 7 SGLT2i
select rx_desc,std_rx_desc,count(*) from in2_prescribing
where rx_desc ~* '(达|恩|坎|伊|托)格列净'
and not rx_desc ~* '双胍'
group by rx_desc,std_rx_desc order by count(*) desc; -- 7
-- 8 复方制剂 没有 * ***************************************************************
select rx_desc,std_rx_desc,count(*) from in2_prescribing
where (rx_desc ~* '(西|沙|维|利)格列汀|恩格列净|欧唐静'
and rx_desc ~* '二甲双胍|甲福明|格华止|奈达|泰白|至力|倍顺|麦克罗辛|麦特美|唐必呋|亿恒|仁欣|悦达宁|力乐尔|卜可|迪化唐锭|美迪康|君士达新|唐落|山姆士|君力达')
or rx_desc ~* '欧双(宁|静)|捷诺达|宜合瑞|安立格'
group by rx_desc,std_rx_desc order by count(*) desc; -- 5
-- 9 基础胰岛素
select rx_desc,std_rx_desc,count(*) from in2_prescribing
where rx_desc ~* '胰岛素|(重和|万苏)林|甘舒霖|优泌(林|乐)|NPH|艾倍得|糖德仕|来得时|优(思|乐)灵|(速|长)秀霖|诺和(锐|平|佳|达|灵)'
and rx_desc ~* '精蛋白锌重组人|低精蛋白锌|精蛋白生物合成人|甘精|地特|(徳|德)谷|(优泌林|重和林|甘舒霖|诺和灵).?N|诺和达|NPH|诺和平|来得时|长秀霖|优乐灵|糖德仕|中效|^精蛋白人胰岛素$'
and not rx_desc ~* '注射器针头|注射针头|射器|针头|泵耗材|混合|30R|50R|70R|25R|预混|门冬|70\/30|诺和灵R|^精蛋白锌重组人胰岛素注射液|\(\)精蛋白锌重组人胰岛素注射液|\(N笔芯\)精蛋白生物合成人胰岛素'
group by rx_desc,std_rx_desc order by count(*) desc; -- 13
-- 10 预混胰岛素
select rx_desc,std_rx_desc,count(*) from in2_prescribing
where rx_desc ~* '胰岛素|(重和|万苏)林|甘舒霖|优泌(林|乐)|NPH|艾倍得|糖德仕|来得时|优(思|乐)灵|(速|长)秀霖|诺和(锐|平|佳|达|灵)'
and rx_desc ~* '30R|30R|50R|30\/70|预混|混(和|合)|(甘舒霖|万苏林).?(30|40|50)R|诺和灵.?50R|优泌林.?70\/30|重和林.?M30|优思灵.?(30\/70)|(优泌乐|诺和锐)(50|25|30)|25|50|70|30注射液|胰岛素30'
and not rx_desc ~* '注射器针头|注射针头|射器|针头|泵耗材|502181|甘精胰岛素'
group by rx_desc,std_rx_desc order by count(*) desc; -- 14
-- 11 餐时胰岛素
select rx_desc,std_rx_desc,count(*) from in2_prescribing
where rx_desc ~* '胰岛素|(重和|万苏)林|甘舒霖|优泌(林|乐)|NPH|艾倍得|糖德仕|来得时|优(思|乐)灵|(速|长)秀霖|诺和(锐|平|佳|达|灵)'
and rx_desc ~* '万邦|诺和(佳|锐)[^3005]|优泌乐|速秀霖|艾倍得|(诺和灵|优思灵|甘舒霖|重和林|优泌林).?R|(门冬双?|赖脯|谷赖)胰岛素|优泌林|^.?胰岛素注射液|^胰岛素$|重组人胰岛素|^人胰岛素|^诺和锐$|.{0,2}?(中|跌|基|门诊).?(生物合成人)?胰岛素注射液|优泌乐|^★胰岛素$|^胰岛素.?\*|速效|R.{0,4}?生物合成人胰岛素注射液|(精蛋白人重组胰岛素注射液|胰岛素).{0,2}自备|R笔?芯.{0,2}(生物合成人胰岛素|人胰岛素注射液)|^停\*胰岛素注射液$'
and not rx_desc ~* '注射器针头|注射针头|射器|针头|泵耗材|混(和|合)|30R|50R|70R|25|预混|70\/30|(胰岛素|优泌乐)(30|50)|M30|诺和灵N|中效|甘精胰岛素|^(德|徳)谷胰岛素|精蛋白人胰岛素【精蛋白生物合成人胰岛素】【精蛋白锌重组人胰岛素】【精蛋白重组人胰岛素】|德谷门冬双胰岛素'
group by rx_desc,std_rx_desc order by count(*) desc; -- 11
-- 12 GLP-1
select rx_desc,std_rx_desc,count(*) from in2_prescribing
where rx_desc ~* '(艾塞那|利司那|贝那鲁|利拉鲁|聚乙二醇洛塞那|司美格鲁|度拉糖)肽|百泌达|百达扬|利时敏|谊生泰|诺和力|弗来美|诺和泰|度易达'
group by rx_desc,std_rx_desc order by count(*) desc; -- 6
-- 13 双联(Dual) 缺失 ***************************************************************************
select rx_desc,std_rx_desc,count(*) from in2_prescribing
where rx_desc ~* '德谷门冬双胰岛素'
group by rx_desc,std_rx_desc order by count(*) desc; -- 1
-- 数据缺失 需要新增药物之后在补充后在标化
update in2_prescribing
set
std_rx_desc = null;
update in2_prescribing
set
std_rx_desc =
case
-- 磺脲类
when rx_desc ~* '格列(本脲|吡嗪|喹酮|齐特|美脲|波脲)|甲苯磺丁脲|氯磺丙脲|优降糖|达安疗|美吡达|瑞易宁|秦苏|迪沙|依吡达|优哒灵|元坦|麦林格|唐贝克|曼迪宝|美吡达|糖适平|捷适|达美康|弗莱因|弘旭阳|谐尔平|亚莫利|万苏平|佑苏|力贻苹|迪北|安多美|科德平|伊瑞|佳和洛|普仁平|克糖利'
then '磺脲类'
-- 格列奈类
when rx_desc ~* '(瑞|那|米)格列奈|诺和龙|弗来迪|唐力|唐瑞|贝加|快如妥'
then '格列奈类'
-- 双胍类
when rx_desc ~* '二甲双胍|甲福明|格华止|奈达|泰白|至力|倍顺|麦克罗辛|麦特美|唐必呋|亿恒|仁欣|悦达宁|力乐尔|卜可|迪化唐锭|美迪康|君士达新|唐落|山姆士|君力达'
and not rx_desc ~* '吡嗪|本脲|注射|列汀|列净'
then '双胍类'
-- 噻唑烷二酮类
when rx_desc ~* '(罗|吡)格列酮|文迪雅|奥洛华|爱能|太罗|维戈洛|宜力喜|圣敏|耐迪|安瑞宁|艾可拓|卡司平|顿灵|贝唐宁|佳普喜|安可妥|凯宝维元|艾汀|卡司平|瑞彤|列洛|夷友'
and not rx_desc ~* '双胍'
then '噻唑烷二酮类'
-- α-糖苷酶抑制剂类
when rx_desc ~* '(阿卡|伏格列)波糖|米格列醇|拜唐苹|卡博平|贝希|倍欣|华怡平|德赛天|米格尼醇|Glyset|奥恬苹|瑞舒'
and not rx_desc ~* '双胍'
then 'α-糖苷酶抑制剂类'
-- DPP4i
when rx_desc ~* '(西|维|沙|利|阿)格列汀'
and not rx_desc ~* '双胍'
then 'DPP4i'
-- SGLT2i
when rx_desc ~* '(达|恩|坎|伊|托)格列净'
and not rx_desc ~* '双胍'
then 'SGLT2i'
-- 复方制剂
when (rx_desc ~* '(西|沙|维|利)格列汀|恩格列净|欧唐静'
and rx_desc ~* '二甲双胍|甲福明|格华止|奈达|泰白|至力|倍顺|麦克罗辛|麦特美|唐必呋|亿恒|仁欣|悦达宁|力乐尔|卜可|迪化唐锭|美迪康|君士达新|唐落|山姆士|君力达')
or rx_desc ~* '欧双(宁|静)|捷诺达|宜合瑞|安立格'
then '复方制剂'
-- 基础胰岛素
when rx_desc ~* '胰岛素|(重和|万苏)林|甘舒霖|优泌(林|乐)|NPH|艾倍得|糖德仕|来得时|优(思|乐)灵|(速|长)秀霖|诺和(锐|平|佳|达|灵)'
and rx_desc ~* '精蛋白锌重组人|低精蛋白锌|精蛋白生物合成人|甘精|地特|(徳|德)谷|(优泌林|重和林|甘舒霖|诺和灵).?N|诺和达|NPH|诺和平|来得时|长秀霖|优乐灵|糖德仕|中效|^精蛋白人胰岛素$'
and not rx_desc ~* '注射器针头|注射针头|射器|针头|泵耗材|混合|30R|50R|70R|25R|预混|门冬|70\/30|诺和灵R|^精蛋白锌重组人胰岛素注射液|\(\)精蛋白锌重组人胰岛素注射液|\(N笔芯\)精蛋白生物合成人胰岛素'
then '基础胰岛素'
-- 预混胰岛素
when rx_desc ~* '胰岛素|(重和|万苏)林|甘舒霖|优泌(林|乐)|NPH|艾倍得|糖德仕|来得时|优(思|乐)灵|(速|长)秀霖|诺和(锐|平|佳|达|灵)'
and rx_desc ~* '30R|30R|50R|30\/70|预混|混(和|合)|(甘舒霖|万苏林).?(30|40|50)R|诺和灵.?50R|优泌林.?70\/30|重和林.?M30|优思灵.?(30\/70)|(优泌乐|诺和锐)(50|25|30)|25|50|70|30注射液|胰岛素30'
and not rx_desc ~* '注射器针头|注射针头|射器|针头|泵耗材|502181|甘精胰岛素'
then '预混胰岛素'
-- 餐时胰岛素
when rx_desc ~* '胰岛素|(重和|万苏)林|甘舒霖|优泌(林|乐)|NPH|艾倍得|糖德仕|来得时|优(思|乐)灵|(速|长)秀霖|诺和(锐|平|佳|达|灵)'
and rx_desc ~* '万邦|诺和(佳|锐)[^3005]|优泌乐|速秀霖|艾倍得|(诺和灵|优思灵|甘舒霖|重和林|优泌林).?R|(门冬双?|赖脯|谷赖)胰岛素|优泌林|^.?胰岛素注射液|^胰岛素$|重组人胰岛素|^人胰岛素|^诺和锐$|.{0,2}?(中|跌|基|门诊).?(生物合成人)?胰岛素注射液|优泌乐|^★胰岛素$|^胰岛素.?\*|速效|R.{0,4}?生物合成人胰岛素注射液|(精蛋白人重组胰岛素注射液|胰岛素).{0,2}自备|R笔?芯.{0,2}(生物合成人胰岛素|人胰岛素注射液)|^停\*胰岛素注射液$'
and not rx_desc ~* '注射器针头|注射针头|射器|针头|泵耗材|混(和|合)|30R|50R|70R|25|预混|70\/30|(胰岛素|优泌乐)(30|50)|M30|诺和灵N|中效|甘精胰岛素|^(德|徳)谷胰岛素|精蛋白人胰岛素【精蛋白生物合成人胰岛素】【精蛋白锌重组人胰岛素】【精蛋白重组人胰岛素】|德谷门冬双胰岛素'
then '餐时胰岛素'
-- GLP-1
when rx_desc ~* '(艾塞那|利司那|贝那鲁|利拉鲁|聚乙二醇洛塞那|司美格鲁|度拉糖)肽|百泌达|百达扬|利时敏|谊生泰|诺和力|弗来美|诺和泰|度易达'
then 'GLP-1'
-- 双联(Dual)
when rx_desc ~* '德谷门冬双胰岛素'
then '双联(Dual)'
else null
end;
select std_rx_desc,count(*) from in2_prescribing
group by std_rx_desc order by count(*) desc; -- 14
select * from in2_prescribing;
-- Vital signs table: missing in the source, so a vital table has to be simulated **********************************************************************************
select * from vital;
-- Rows that carry item_value_addition are split into two rows:
-- '舒张压' takes item_value_addition and '收缩压' takes item_value.
-- All other rows are copied with their own item_name.
CREATE TABLE in2_vital as (
select distinct patient_id,visit_id,patient_type,measure_datetime,'舒张压' as item_name,item_value_addition as item_value,item_uom from vital where item_value_addition is not null
union all
select distinct patient_id,visit_id,patient_type,measure_datetime,'收缩压' as item_name,item_value,item_uom from vital where item_value_addition is not null
union all
select distinct patient_id,visit_id,patient_type,measure_datetime,item_name,item_value,item_uom from vital where item_value_addition is null
);
select * from in2_vital;
-- 化验表
select * from in2_lab_result_cm where lab_item_name ~* '葡萄糖' and lab_item_name ~* '2小时';
ALTER TABLE in2_lab_result_cm ADD COLUMN std_lab_item_name VARCHAR(30);
-- FPG
select lab_item_name,std_lab_item_name,specimen_source,result_unit,count(*) from in2_lab_result_cm
where lab_item_name ~* '空腹|FPG'
and specimen_source ~* '血'
group by lab_item_name,specimen_source,result_unit,std_lab_item_name order by count(*); -- 2
-- HbA1c
select lab_item_name,specimen_source,result_unit,std_lab_item_name,count(*) from in2_lab_result_cm
where lab_item_name ~* 'HbA1c|糖化血红蛋白'
and specimen_source ~* '血'
group by lab_item_name,specimen_source,result_unit,std_lab_item_name order by count(*); -- 2
-- 葡萄糖负荷2小时血糖
select lab_item_name,specimen_source,result_unit,std_lab_item_name,count(*) from in2_lab_result_cm
where lab_item_name ~* '葡萄糖'
and lab_item_name ~* '[^餐后]2小时'
and specimen_source ~* '血'
group by lab_item_name,specimen_source,result_unit,std_lab_item_name order by count(*); -- 3
-- 空腹C肽
select lab_item_name,specimen_source,result_unit,std_lab_item_name,count(*) from in2_lab_result_cm
where lab_item_name ~* '^C肽$' and result_unit = 'ng/mL'
group by lab_item_name,specimen_source,result_unit,std_lab_item_name order by count(*); -- 1
-- 餐后1小时C肽
select lab_item_name,specimen_source,result_unit,std_lab_item_name,count(*) from in2_lab_result_cm
where lab_item_name ~* 'C肽'
and lab_item_name ~* '1'
and specimen_source ~* '血'
and result_unit = 'nmol/L'
group by lab_item_name,specimen_source,result_unit,std_lab_item_name order by count(*); -- 2
-- 餐后2小时C肽
select lab_item_name,std_lab_item_name,specimen_source,result_unit,count(*) from in2_lab_result_cm
where lab_item_name ~* 'C肽'
and lab_item_name ~* '2'
and specimen_source ~* '血'
and result_unit = 'nmol/L'
group by lab_item_name,specimen_source,result_unit,std_lab_item_name order by count(*); -- 2
-- 餐后3小时C肽 ************************** 缺失 *****
select lab_item_name,specimen_source,result_unit,std_lab_item_name,count(*) from in2_lab_result_cm
where lab_item_name ~* 'C肽'
and lab_item_name ~* '3'
and specimen_source ~* '血'
and result_unit = 'nmol/L'
group by lab_item_name,specimen_source,result_unit,std_lab_item_name order by count(*); -- 2
-- 化验标化
update in2_lab_result_cm
set
std_lab_item_name = null;
update in2_lab_result_cm
set
std_lab_item_name =
case
-- FPG
when lab_item_name ~* '空腹|FPG' and specimen_source ~* '血'
then 'FPG'
-- HbA1c
when lab_item_name ~* 'HbA1c|糖化血红蛋白' and specimen_source ~* '血'
then 'HbA1c'
-- 葡萄糖负荷2小时血糖
when lab_item_name ~* '葡萄糖' and lab_item_name ~* '[^餐后]2小时' and specimen_source ~* '血'
then '葡萄糖负荷2小时血糖'
-- 空腹C肽
when lab_item_name ~* '^C肽$' and result_unit = 'ng/mL'
then '空腹C肽'
-- 餐后1小时C肽
when lab_item_name ~* 'C肽' and lab_item_name ~* '1' and specimen_source ~* '血' and result_unit = 'nmol/L'
then '餐后1小时C肽'
-- 餐后2小时C肽
when lab_item_name ~* 'C肽' and lab_item_name ~* '2' and specimen_source ~* '血' and result_unit = 'nmol/L'
then '餐后2小时C肽'
-- 餐后3小时C肽
when lab_item_name ~* 'C肽' and lab_item_name ~* '3' and specimen_source ~* '血' and result_unit = 'nmol/L'
then '餐后3小时C肽'
else null
end;
select std_lab_item_name,count(*) from in2_lab_result_cm
group by std_lab_item_name order by count(*); -- 2
-- Qualitative / quantitative result standardization ****************************************************************************************************************************************************************************************
-- Applied after item standardization, and only to rows that received a standardized item name
select result_num,result_text from in2_lab_result_cm
where std_lab_item_name is not null
group by result_num,result_text;
select result_num,result_text from in2_lab_result_cm
where std_lab_item_name is not null and result_num is null
group by result_num,result_text;
-- Copy text results that are plain numbers. The pattern is anchored at both
-- ends so that values with trailing text (e.g. '5.6mmol/L') are not copied.
UPDATE in2_lab_result_cm
SET result_num = result_text
where std_lab_item_name is not null and result_num is null
and result_text ~* '^[0-9]+(\.[0-9]+)?$';
-- Preview of the remaining rows with '<' removed
select result_num,result_text,REGEXP_REPLACE(result_text, '<', '', 'g') as c from in2_lab_result_cm
where std_lab_item_name is not null and result_num is null
group by result_num,result_text;
-- Results reported as "<x": drop the '<' and keep the number. Text that is
-- still not a plain number after that is left untouched.
UPDATE in2_lab_result_cm
SET result_num = REGEXP_REPLACE(result_text, '<', '', 'g')
where std_lab_item_name is not null and result_num is null
and REGEXP_REPLACE(result_text, '<', '', 'g') ~* '^[0-9]+(\.[0-9]+)?$';
\ No newline at end of file
-- 查询表格前面要加上 iceberg.cdm.
-- 如果人数过多 可以一年提取
-- 分析时候用的标黄的列 关联表用到科室
-- 提取数据按照人提取 时间段为 2021.01.01至2024.12.31 也可以改为 2019-2022
-- 标准语法参考地址 :https://datafusion.apache.org/user-guide/sql/select.html#select-clause
-- 1. visit table
-- t1: outpatient and inpatient visits inside the date window, in one row shape.
-- The to_date format uses strftime-style specifiers, as in the DataFusion
-- reference linked at the top of this file.
WITH t1 AS (
    select distinct
        outpatient_source_no as visit_id,
        patient_source_id as patient_id,
        date_of_visiting as admission_datetime,
        specialty_name as specialty,
        org_code as provider_id
    from iceberg.cdm.outpatient_record
    where date_of_visiting >= to_date('20210101', '%Y%m%d') and date_of_visiting <= to_date('20241231', '%Y%m%d')
    union all
    select distinct
        inpatient_source_no as visit_id,
        patient_source_id as patient_id,
        admi_time as admission_datetime,
        admi_specialty_name as specialty,
        org_code as provider_id
    from iceberg.cdm.inpat_record
    where admi_time >= to_date('20210101', '%Y%m%d') and admi_time <= to_date('20241231', '%Y%m%d')
),
-- t2: b = visits carrying a diabetes diagnosis, c = all visits of the same
-- patient at the same organisation. t1 exposes org_code as provider_id, so
-- that is the column joined on; it is returned under the name org_code.
t2 as (
    select distinct b.patient_id, c.visit_id, c.provider_id as org_code, c.admission_datetime, c.specialty
    from iceberg.cdm.patient_diagnosis a
    join t1 b on a.org_code = b.provider_id and a.visit_source_no = b.visit_id
    join t1 c on b.provider_id = c.provider_id and b.patient_id = c.patient_id
    -- string literals take single quotes; double quotes denote identifiers
    where a.diagnosis_name like '%糖尿病%'
)
select * from t2;
-- visit, second version (2025-01-06). Inpatient and outpatient times can be
-- converted to one common format. Run with "LIMIT 10" first to check the statement.
WITH t1 AS (
    SELECT DISTINCT
        visit_record_id AS visit_id,
        outpatient_record_id AS patient_id,
        date_of_visiting AS admission_datetime,
        null as discharge_datetime,
        specialty_name AS specialty,
        CASE
            WHEN is_emergency = 'true' THEN '急诊'
            ELSE '门诊'
        END AS patient_type
    FROM
        iceberg.cdm.outpatient_record
    WHERE CAST(date_of_visiting AS DATE) >= DATE '2021-01-01' AND CAST(date_of_visiting AS DATE) <= DATE '2024-12-31'
    union all
    -- UNION ALL matches columns by position, so this branch lists them in the
    -- same order as the outpatient branch (discharge_datetime before specialty)
    SELECT DISTINCT
        visit_record_id as visit_id,
        inpat_record_id as patient_id,
        admission_time as admission_datetime,
        discharge_time as discharge_datetime,
        admission_specialty_name as specialty,
        '住院' as patient_type
    FROM
        iceberg.cdm.inpat_record
    WHERE discharge_time >= TIMESTAMP '2021-01-01 00:00:00+00:00'
      AND discharge_time < TIMESTAMP '2025-01-01 00:00:00+00:00'
),
-- t2: b = visits carrying a diabetes diagnosis, c = all visits of the same patient
t2 as (
    select distinct
        b.patient_id,
        c.visit_id,
        c.admission_datetime,
        c.discharge_datetime,
        c.specialty,
        c.patient_type
    from iceberg.cdm.patient_diagnosis a
    join t1 b on a.visit_record_id = b.visit_id
    join t1 c on b.patient_id = c.patient_id
    -- string literals take single quotes; double quotes denote identifiers
    where a.diagnosis_name like '%糖尿病%'
)
select * from t2;
-- 2. patient table
-- t1: outpatient and inpatient visits inside the date window.
-- The to_date format uses strftime-style specifiers, as in the DataFusion
-- reference linked at the top of this file.
WITH t1 AS (
    select distinct
        outpatient_source_no as visit_id,
        patient_source_id as patient_id,
        org_code as provider_id
    from iceberg.cdm.outpatient_record
    where date_of_visiting >= to_date('20210101', '%Y%m%d') and date_of_visiting <= to_date('20241231', '%Y%m%d')
    union all
    select distinct
        inpatient_source_no as visit_id,
        patient_source_id as patient_id,
        org_code as provider_id
    from iceberg.cdm.inpat_record
    where admi_time >= to_date('20210101', '%Y%m%d') and admi_time <= to_date('20241231', '%Y%m%d')
),
-- t2: base information of the patients behind visits with a diabetes diagnosis
t2 as (
    select distinct
        c.patient_source_id as patient_id,
        c.gender_name as sex,
        c.date_of_birth as birth_date,
        c.org_code as provider_id
    from iceberg.cdm.patient_diagnosis a
    join t1 b on a.org_code = b.provider_id and a.visit_source_no = b.visit_id
    -- t1 exposes org_code under the name provider_id
    join iceberg.cdm.patient_base_info c on b.provider_id = c.org_code and b.patient_id = c.patient_source_id
    -- string literals take single quotes; double quotes denote identifiers
    where a.diagnosis_name like '%糖尿病%'
)
select * from t2;
-- The time window is applied to the discharge time
-- 2. patient table, second version (2025-01-06)
WITH t1 AS (
    -- A leftover "END AS patient_type" (no CASE, no comma) was removed here;
    -- both branches now select the same two columns.
    SELECT DISTINCT
        visit_record_id AS visit_id,
        outpatient_record_id AS patient_id
    FROM
        iceberg.cdm.outpatient_record
    WHERE CAST(date_of_visiting AS DATE) >= DATE '2021-01-01' AND CAST(date_of_visiting AS DATE) <= DATE '2024-12-31'
    union all
    SELECT DISTINCT
        visit_record_id as visit_id,
        inpat_record_id as patient_id
    FROM
        iceberg.cdm.inpat_record
    WHERE discharge_time >= TIMESTAMP '2021-01-01 00:00:00+00:00'
      AND discharge_time < TIMESTAMP '2025-01-01 00:00:00+00:00'
),
t2 as (
    select distinct
        c.patient_source_id as patient_id,
        c.gender_standard_name as sex,
        c.date_of_birth as birth_date
    from iceberg.cdm.patient_diagnosis a
    join t1 b on a.visit_record_id = b.visit_id
    join iceberg.cdm.patient_base_info c
      on b.patient_id = c.patient_source_id
    -- Same diabetes filter as the other extraction queries in this file.
    -- It was absent here, which would return every diagnosed patient.
    -- NOTE(review): confirm the omission was not intentional.
    where a.diagnosis_name like '%糖尿病%'
)
select * from t2;
-- 3. prescribing
-- Note: no quantity unit was found; inpatient orders have no days-of-supply value
-- The to_date format uses strftime-style specifiers, as in the DataFusion
-- reference linked at the top of this file.
WITH t1 AS (
    select distinct
        outpatient_source_no as visit_id,
        patient_source_id as patient_id,
        specialty_name as specialty,
        org_code as provider_id
    from iceberg.cdm.outpatient_record
    where date_of_visiting >= to_date('20210101', '%Y%m%d') and date_of_visiting <= to_date('20241231', '%Y%m%d')
    union all
    select distinct
        inpatient_source_no as visit_id,
        patient_source_id as patient_id,
        admi_specialty_name as specialty,
        org_code as provider_id
    from iceberg.cdm.inpat_record
    where admi_time >= to_date('20210101', '%Y%m%d') and admi_time <= to_date('20241231', '%Y%m%d')
),
-- t2: all visits (c) of patients with a diabetes-diagnosed visit (b).
-- t1 exposes org_code as provider_id; it is re-exposed as org_code because
-- t3 and t4 read a.org_code.
t2 as (
    select distinct b.patient_id, c.visit_id, c.specialty, c.provider_id as org_code
    from iceberg.cdm.patient_diagnosis a
    join t1 b on a.org_code = b.provider_id and a.visit_source_no = b.visit_id
    join t1 c on b.provider_id = c.provider_id and b.patient_id = c.patient_id
    -- string literals take single quotes; double quotes denote identifiers
    where a.diagnosis_name like '%糖尿病%'
),
-- t3: outpatient prescription lines
t3 as (
    select distinct
        a.patient_id,
        a.visit_id,
        '门诊' as patient_type,
        a.specialty,
        b.drug_name as rx_desc,
        b.prescription_time as order_datetime,
        null as rx_start_datetime,
        null as rx_end_datetime,
        b.dose as dosage_qty,
        b.dose_unit_name as dosage_unit,
        b.frequency_name as frequency,
        b.qty as quantity,
        -- quantity_uom -- no quantity unit found
        b.specs as drug_spec,
        b.day_num as days_supply,
        a.org_code as provider_id
    from t2 a
    join iceberg.cdm.outpat_recipe_detail b on a.org_code = b.org_code and a.visit_id = b.outpatient_source_no
),
-- t4: inpatient drug orders, same column order as t3
t4 as (
    select distinct
        a.patient_id,
        a.visit_id,
        '住院' as patient_type,
        b.specialty_name as specialty,
        b.drug_name as rx_desc,
        b.input_time as order_datetime,
        b.begin_time as rx_start_datetime,
        b.end_time as rx_end_datetime,
        b.dose as dosage_qty,
        b.dose_unit_name as dosage_unit,
        b.frequency_name as frequency,
        b.qty as quantity,
        -- quantity_uom -- no quantity unit found
        b.specs as drug_spec,
        null as days_supply, -- inpatient orders have no such column; alternatively subtract the two dates to get the days
        a.org_code as provider_id
    from t2 a
    join iceberg.cdm.inpat_drug_order b on a.org_code = b.org_code and a.visit_id = b.inpatient_source_no
)
select * from t3
union all
select * from t4;
-- 3. prescribing, second version (2025-01-06)
WITH t1 AS (
    SELECT DISTINCT
        visit_record_id AS visit_id,
        outpatient_record_id AS patient_id,
        date_of_visiting AS admission_datetime,
        specialty_name AS specialty,
        CASE
            WHEN is_emergency = 'true' THEN '急诊'
            ELSE '门诊'
        END AS patient_type
    FROM
        iceberg.cdm.outpatient_record
    WHERE CAST(date_of_visiting AS DATE) >= DATE '2021-01-01' AND CAST(date_of_visiting AS DATE) <= DATE '2024-12-31'
    union all
    SELECT DISTINCT
        visit_record_id as visit_id,
        inpat_record_id as patient_id,
        admission_time as admission_datetime,
        admission_specialty_name as specialty,
        '住院' as patient_type
    FROM
        iceberg.cdm.inpat_record
    WHERE discharge_time >= TIMESTAMP '2021-01-01 00:00:00+00:00'
      AND discharge_time < TIMESTAMP '2025-01-01 00:00:00+00:00'
),
-- t2: all visits (c) of patients with a diabetes-diagnosed visit (b)
t2 as (
    select distinct b.patient_id, c.visit_id, c.admission_datetime, c.specialty, c.patient_type
    from iceberg.cdm.patient_diagnosis a
    join t1 b on a.visit_record_id = b.visit_id
    join t1 c on b.patient_id = c.patient_id
    -- string literals take single quotes; double quotes denote identifiers
    where a.diagnosis_name like '%糖尿病%'
),
-- t3: outpatient prescription lines
t3 as (
    select distinct
        b.patient_id,
        b.visit_id,
        b.patient_type,
        b.specialty,
        a.drug_name as rx_desc,
        a.prescription_time as order_datetime,
        null as rx_start_datetime,
        null as rx_end_datetime,
        a.dose as dosage_qty,
        a.dose_unit_name as dosage_unit,
        a.frequency_name as frequency,
        a.qty as quantity,
        -- null as quantity_uom, -- no such column found
        a.route_name as roa,
        a.specs as drug_spec,
        a.day_num as days_supply
    from iceberg.cdm.outpat_recipe_detail a
    join t2 b on a.visit_record_id = b.visit_id
    -- TODO: the drug-name filter was an empty placeholder (like "") and is
    -- disabled until a pattern is chosen, e.g.
    --   where a.drug_name like '%DRUG_NAME%'
    -- or a regex operator such as ~* if the engine supports one
),
-- t4: inpatient drug orders, same column order as t3
t4 as (
    select distinct
        b.patient_id,
        b.visit_id,
        b.patient_type,
        b.specialty,
        a.drug_name as rx_desc,
        a.input_time as order_datetime,
        a.begin_time as rx_start_datetime,
        a.end_time as rx_end_datetime,
        a.dose as dosage_qty,
        a.dose_unit_name as dosage_unit,
        a.frequency_name as frequency,
        a.qty as quantity,
        -- null as quantity_uom, -- no such column found
        a.route_name as roa,
        a.specs as drug_spec,
        -- First test whether the day count can be computed this way; if it is
        -- too slow, extract the data and compute it during standardization.
        -- NOTE(review): this is a time difference while t3.days_supply is day_num;
        -- confirm the UNION ALL below accepts both types.
        a.end_time - a.begin_time as days_supply
    from iceberg.cdm.inpat_drug_order a
    join t2 b on a.visit_record_id = b.visit_id
)
select * from t3
union all
select * from t4;
-- 4. diagnosis 表
WITH t1 AS (
    -- Outpatient and inpatient visits of 2021-2024, keyed by source visit
    -- number; the organisation code is exposed as provider_id.
    select distinct
        outpatient_source_no as visit_id,
        patient_source_id as patient_id,
        specialty_name as specialty,
        org_code as provider_id
    from iceberg.cdm.outpatient_record
    where date_of_visiting >= to_date('20210101', 'yyyyMMdd') and date_of_visiting <= to_date('20241231', 'yyyyMMdd')
    union all
    select distinct
        inpatient_source_no as visit_id,
        patient_source_id as patient_id,
        admi_specialty_name as specialty,
        org_code as provider_id
    from iceberg.cdm.inpat_record
    where admi_time >= to_date('20210101', 'yyyyMMdd') and admi_time <= to_date('20241231', 'yyyyMMdd')
),
t2 as (
    -- Cohort: all visits (c) of patients having a diabetes-diagnosed visit (b).
    -- t1 names the organisation column provider_id, so it is referenced under
    -- that name here and re-exposed as org_code for t3.
    select distinct b.patient_id, c.visit_id, c.specialty, c.provider_id as org_code
    from iceberg.cdm.patient_diagnosis a
    join
        t1 b on a.org_code = b.provider_id and a.visit_source_no = b.visit_id
    join
        t1 c on b.provider_id = c.provider_id and b.patient_id = c.patient_id
    where a.diagnosis_name like '%糖尿病%'
),
t3 as (
    -- Diagnoses of the cohort visits. The diagnosis columns come from the
    -- diagnosis table (b); the cohort CTE (a) does not carry them.
    select distinct
        a.patient_id,
        a.visit_id,
        b.visit_type as patient_type,
        b.diagnosis_time as diagnosis_datetime,
        a.specialty,
        b.diagnosis_code as dx,
        b.diagnosis_name as dx_desc,
        b.diagnosis_code as icd10_code,
        b.diagnosis_name as icd10_name,
        a.org_code as provider_id
    from t2 a
    join
        -- Join on the visit key, mirroring t2; the original predicate
        -- a.patient_id = a.patient_id compared a column with itself.
        iceberg.cdm.patient_diagnosis b on a.org_code = b.org_code and a.visit_id = b.visit_source_no)
select * from t3;
-- 4.第二版 diagnosis 2025-01-06
WITH t1 AS (
    -- All outpatient/emergency visits dated 2021-01-01 .. 2024-12-31 ...
    SELECT DISTINCT
        visit_record_id AS visit_id,
        outpatient_record_id AS patient_id,
        date_of_visiting AS admission_datetime,
        specialty_name AS specialty,
        CASE
            WHEN is_emergency = 'true' THEN '急诊'
            ELSE '门诊'
        END AS patient_type
    FROM
        iceberg.cdm.outpatient_record
    WHERE CAST(date_of_visiting AS DATE) >= DATE '2021-01-01' AND CAST(date_of_visiting AS DATE) <= DATE '2024-12-31'
    union all
    -- ... plus all inpatient stays discharged in the same four years.
    SELECT DISTINCT
        visit_record_id as visit_id,
        inpat_record_id as patient_id,
        admission_time as admission_datetime,
        admission_specialty_name as specialty,
        '住院' as patient_type
    FROM
        iceberg.cdm.inpat_record
    WHERE discharge_time >= TIMESTAMP '2021-01-01 00:00:00+00:00'
      AND discharge_time < TIMESTAMP '2025-01-01 00:00:00+00:00'
),
t2 as (
    -- Cohort: every visit (c) of a patient who has at least one visit (b)
    -- carrying a diabetes diagnosis (a).
    select distinct b.patient_id, c.visit_id, c.admission_datetime, c.specialty, c.patient_type
    from iceberg.cdm.patient_diagnosis a
    join
        t1 b on a.visit_record_id = b.visit_id
    join
        t1 c on b.patient_id = c.patient_id
    where a.diagnosis_name like '%糖尿病%'
),
t3 as (
    -- Diagnoses of interest recorded on the cohort visits.
    select distinct
        b.patient_id,
        b.visit_id,
        b.patient_type,
        a.diagnosis_time as diagnosis_datetime,
        b.specialty,
        a.diagnosis_code as dx,
        a.diagnosis_name as dx_desc,
        a.diagnosis_code as icd10_code,   -- fixed alias typo (was dicd10_code)
        -- diagnosis_type_name: meaning still to be confirmed before mapping it
        a.is_primary as pdx,
        a.diagnosis_name as icd10_name
    from iceberg.cdm.patient_diagnosis a
    join t2 b on a.visit_record_id = b.visit_id
    -- String literals must be single-quoted; double quotes denote identifiers.
    where a.diagnosis_name like '%糖尿病%'
       or a.diagnosis_name like '%高血压%'
       -- TODO: append the remaining diagnosis-name patterns as further OR terms
       -- (the original held a literal "......" placeholder here).
)
select * from t3;
-- 5. vital表
-- 需要先确定表格再映射;vital_signs_record 此表格没有数据,需要询问数据对接人
select * from iceberg.cdm.vital_signs_record LIMIT 10;
-- 6. lab_result_cm 表 2025-01-06
WITH t1 AS (
    -- All outpatient/emergency visits dated 2021-01-01 .. 2024-12-31 ...
    SELECT DISTINCT
        visit_record_id AS visit_id,
        outpatient_record_id AS patient_id,
        date_of_visiting AS admission_datetime,
        specialty_name AS specialty,
        CASE
            WHEN is_emergency = 'true' THEN '急诊'
            ELSE '门诊'
        END AS patient_type
    FROM
        iceberg.cdm.outpatient_record
    WHERE CAST(date_of_visiting AS DATE) >= DATE '2021-01-01' AND CAST(date_of_visiting AS DATE) <= DATE '2024-12-31'
    union all
    -- ... plus all inpatient stays discharged in the same four years.
    SELECT DISTINCT
        visit_record_id as visit_id,
        inpat_record_id as patient_id,
        admission_time as admission_datetime,
        admission_specialty_name as specialty,
        '住院' as patient_type
    FROM
        iceberg.cdm.inpat_record
    WHERE discharge_time >= TIMESTAMP '2021-01-01 00:00:00+00:00'
      AND discharge_time < TIMESTAMP '2025-01-01 00:00:00+00:00'
),
t2 as (
    -- Cohort: every visit (c) of a patient who has at least one visit (b)
    -- carrying a diabetes diagnosis (a).
    select distinct b.patient_id, c.visit_id, c.admission_datetime, c.specialty, c.patient_type
    from iceberg.cdm.patient_diagnosis a
    join
        t1 b on a.visit_record_id = b.visit_id
    join
        t1 c on b.patient_id = c.patient_id
    where a.diagnosis_name like '%糖尿病%'
),
t3 as (
    -- Lab reports (a) of the cohort visits (b) with their result rows (c).
    select distinct
        b.patient_id,
        b.visit_id,
        b.patient_type,
        a.report_name as lab_name,
        c.test_item_name as lab_item_name,
        null as std_lab_item_name,
        a.specimen_name as specimen_source,
        null as lab_order_datetime,
        a.specimen_collected_time as specimen_datetime,
        a.report_time as result_datetime,
        c.numerical_value as result_num,
        c.unit_name as result_unit,
        c.reference_range as result_range,
        c.text_value as result_qual,
        c.critical_flag as abnormal_ind
    from iceberg.cdm.lab_report a
    join
        t2 b on a.visit_record_id = b.visit_id
    join
        -- Fixed: the result table had no alias although every c.* column refers
        -- to it, and the predicate used the cohort alias b instead of c.
        -- Alternative key to evaluate: report_source_id = result_source_id.
        iceberg.cdm.lab_report_result c
        on a.lab_report_id = c.lab_report_id
    -- String literals must be single-quoted; double quotes denote identifiers.
    where c.test_item_name like '%葡萄糖%'
       or c.test_item_name like '%C肽%'
       -- TODO: add the remaining test-item patterns once the item names have
       -- been profiled (the original held a literal "......" placeholder here).
)
select * from t3;
from data_query import *
query = "SELECT * FROM iceberg.cdm.outpatient_record WHERE CAST(date_of_visiting AS DATE) >= DATE '2021-01-01' AND CAST(date_of_visiting AS DATE) <= DATE '2024-12-31'"
df = execute_query(query)
# Print every column name together with its dtype.
# (Fixed: the loop body had lost its indentation, which is a syntax error.)
for column in df.columns:
    print(f"Column: {column}, Type: {df[column].dtype}")
# Alternatively, print the dtype summary of the whole DataFrame at once.
print(df.dtypes)
-- 时间列数据类型查询
SELECT * FROM iceberg.cdm.outpatient_record WHERE CAST(date_of_visiting AS DATE) >= DATE '2021-01-01' AND CAST(date_of_visiting AS DATE) <= DATE '2024-12-31';
-- 查询 符合 的 visit 人数
WITH t1 AS (
    -- Outpatient and inpatient visits of 2021-2024, keyed by source visit
    -- number; the organisation code is exposed as provider_id.
    select distinct
        outpatient_source_no as visit_id,
        patient_source_id as patient_id,
        date_of_visiting as admission_datetime,
        specialty_name as specialty,
        org_code as provider_id
    from iceberg.cdm.outpatient_record
    where date_of_visiting >= to_date('20210101', 'yyyyMMdd') and date_of_visiting <= to_date('20241231', 'yyyyMMdd')
    union all
    select distinct
        inpatient_source_no as visit_id,
        patient_source_id as patient_id,
        admi_time as admission_datetime,
        admi_specialty_name as specialty,
        org_code as provider_id
    from iceberg.cdm.inpat_record
    where admi_time >= to_date('20210101', 'yyyyMMdd') and admi_time <= to_date('20241231', 'yyyyMMdd')
),
t2 as (
    -- All visits (c) of patients having a diabetes-diagnosed visit (b).
    -- t1 exposes the organisation column as provider_id, so it must be
    -- referenced under that name; it is re-aliased to keep the output column
    -- called org_code.
    select distinct b.patient_id, c.visit_id, c.provider_id as org_code, c.admission_datetime, c.specialty
    from iceberg.cdm.patient_diagnosis a
    join
        t1 b on a.org_code = b.provider_id and a.visit_source_no = b.visit_id
    join
        t1 c on b.provider_id = c.provider_id and b.patient_id = c.patient_id
    -- String literals must be single-quoted; double quotes denote identifiers.
    where a.diagnosis_name like '%糖尿病%')
select * from t2;
WITH t1 AS (
select distinct
outpatient_source_no as visit_id,
outpatient_record_id as patient_id
from iceberg.cdm.outpatient_record
union all
select distinct
inpatient_source_no as visit_id,
inpat_record_id as patient_id
from iceberg.cdm.inpat_record
)
select * from t1;
WITH t1 AS (
select distinct
outpatient_source_no as visit_id,
outpatient_record_id as patient_id
from iceberg.cdm.outpatient_record
)
select * from t1;
-- 糖尿病诊断人次
select count(*) from iceberg.cdm.patient_diagnosis where diagnosis_name like '%糖尿病%'; -- 一千五百万
select count(distinct visit_record_id) from iceberg.cdm.patient_diagnosis where diagnosis_name like '%糖尿病%'; -- 一千五百万
visit_record_id
select count(*)
from iceberg.cdm.outpatient_record
WHERE CAST(date_of_visiting AS DATE) >= DATE '2021-01-01' AND CAST(date_of_visiting AS DATE) <= DATE '2024-12-31'; -- 两亿六
SELECT admission_time
FROM iceberg.cdm.inpat_record
WHERE admission_time >= TIMESTAMP '2021-01-01 00:00:00+00:00'
AND admission_time < TIMESTAMP '2025-01-01 00:00:00+00:00'
LIMIT 10; -- 七百多万
SELECT count(distinct a.visit_record_id)
FROM iceberg.cdm.outpatient_record a join iceberg.cdm.patient_diagnosis b
on a.visit_record_id = b.visit_record_id
WHERE a.date_of_visiting >= '2021-01-01' AND a.date_of_visiting < '2025-01-01'
AND b.diagnosis_name LIKE '%糖尿病%'; -- 九百多万
-- 可能存在问题 sql查询时间
-- 查询出数据量限制问题
-- 2025-01-07需要确认问题 ************************************************************
-- 确定各表visit_id 与patient_id 表关联
select * from iceberg.cdm.outpatient_record -- 门诊就诊
iceberg.cdm.inpat_record -- 住院就诊
iceberg.cdm.patient_diagnosis --
-- 是否支持WITH 语句 和多层WITH 语句
-- Probe: single CTE. (Fixed: duplicated SELECT keyword in the outer query.)
WITH t1 AS (SELECT diagnosis_name from iceberg.cdm.patient_diagnosis LIMIT 10) select * from t1;
-- Probe: chained CTEs. WITH is written once; further CTEs are comma-separated.
WITH t1 AS (SELECT diagnosis_name from iceberg.cdm.patient_diagnosis LIMIT 10),
t2 AS (select * from t1 LIMIT 5)
select * from t2;
-- 是否支持正则 和查看 diagnosis_type_name 是患者类型 还是诊断来源或其他
-- Probe: case-insensitive regex match. The pattern is a single-quoted string
-- and carries no LIKE-style % wildcards (a regex matches substrings already).
SELECT diagnosis_name,diagnosis_type_name from iceberg.cdm.patient_diagnosis where diagnosis_name ~* '糖尿病' LIMIT 10;
-- 确定就诊门诊表 is_emergency是否急诊列内数据
SELECT is_emergency,count(*) FROM iceberg.cdm.outpatient_record group by is_emergency LIMIT 1000 ;
-- 药品两列日期相减 如果运行过慢将不处理 后续用程序 或留给分析师处理
select begin_time,end_time from iceberg.cdm.inpat_drug_order LIMIT 10;
select begin_time,end_time,end_time - begin_time as days_supply from iceberg.cdm.inpat_drug_order LIMIT 10;
-- 确定体征表 模型中表格应该是为空的 与数据对接人沟通
select * from iceberg.cdm.vital_signs_record LIMIT 10;
-- 再确认下医嘱表是否有数量单位列 ‘quantity_uom’
-- 导出数据量测试 限制条数 查看导出数据是否完整
select begin_time,end_time from iceberg.cdm.inpat_drug_order LIMIT 100000;
-- 确定最小数据提取 确定sql代码能跑通
-- 确定每张表待提取量
-- 然后代码执行全量循环提取到本地 或一年一年提取 晚上循环提取?
-- 检查数据量是否与待提取量一致
-- 再执行标化等工作 由辉哥的程序执行
-- 下面是需要提数据各表在库中的数据列
"门诊或住院" as patient_type
outpatient_record 门诊就诊表 三亿八
outpatient_record_id object 门诊记录编号
visit_record_id object 来访记录编号
outpatient_source_no object 门诊流水号
data_source_primary_key object 数据源主键
is_emergency bool 是否急诊 急诊类型不能丢失 其他表格的患者类型 可能需要关联此表来获取
specialty_standard_code object 科室标准代码
specialty_standard_name object 科室标准名称
specialty_code object 就诊科室代码
specialty_name object 就诊科室名称 specialty
date_of_visiting object 就诊日期 admission_datetime
MD_code object 接诊医生代码
MD_name object 接诊医生姓名
first_governing_time datetime64[us, UTC] 第一次治理时间
last_governing_time datetime64[us, UTC] 上次治理时间
validity object 有效性标准代码
"住院" as patient_type
inpat_record 住院就诊表 一千万
inpat_record_id object 住院记录编号
visit_record_id object 来访记录编号
inpatient_source_no object 住院流水号
number_of_hospitalizations float64 第几次住院
data_source_primary_key object 数据源主键
admission_specialty_standard_code object 入院科室标准代码
admission_specialty_standard_name object 入院科室标准名称
admission_specialty_code object 入院科室代码
admission_specialty_name object 入院科室名称 specialty
admission_time datetime64[us, UTC] 入院时间 admission_datetime
discharge_specialty_standard_code object 出院科室标准代码
discharge_specialty_standard_name object 出院科室标准名称
discharge_specialty_code object 出院科室代码
discharge_specialty_name object 出院科室名称
discharge_time datetime64[us, UTC] 出院时间
first_governing_time datetime64[us, UTC] 第一次治理时间
last_governing_time datetime64[us, UTC] 上次治理时间
科室从visit表中获取
患者类型需要查找或从visit中获取
patient_diagnosis 诊断表 四亿一
diagnosis_id object 诊断编号
visit_record_id object 来访记录编号
data_source_primary_key object 数据源主键
diagnosis_type_standard_code object 诊断类型标准代码
diagnosis_type_standard_name object 诊断类型标准名称
diagnosis_type_code object 诊断类型代码
diagnosis_type_name object 诊断类型名称 可能是患者类型 需要确认
diagnosis_standard_code object 诊断标准代码
diagnosis_standard_name object 诊断标准名称
diagnosis_code object 诊断代码 dx icd10_code
diagnosis_name object 诊断名称 dx_desc icd10_name
single_diagnosis_name object 单一的诊断名称
icd_edition_standard_code object Icd版标准代码
icd_edition_standard_name object Icd版标准名称
is_primary bool 是否主要诊断
diagnosis_time datetime64[us, UTC] 诊断时间 diagnosis_datetime
first_governing_time datetime64[us, UTC] 第一次治理时间
last_governing_time datetime64[us, UTC] 上次治理时间
validity object
patient_base_info 患者表 七千万
pat_base_id object
patient_source_id object 患者编号 patient_id
data_source_primary_key object 数据源主键
organization_id object
org_standard_code object
org_standard_name object
patient_name object
gender_standard_code object
gender_standard_name object
gender_code object 性别代码
gender_name object 性别名称 sex
date_of_birth object 出生日期 birth_date
country_standard_code object
country_standard_name object
country_code object
country_name object
nation_standard_code object
nation_standard_name object
nation_code object
nation_name object
id_type_standard_code object
id_type_standard_name object
id_type_code object
id_type_name object
id_no object
outpatient_MRN object
inpatient_MRN object
MRN object
home_address object
employer object
cell_phone object
email object
contact_name object
contact_relation_standard_code object
contact_relation_standard_name object
contact_relation_code object
contact_relation_name object
contact_gender_standard_code object
contact_gender_standard_name object
contact_gender_code object
contact_gender_name object
contact_phone object
first_governing_time datetime64[us, UTC]
last_governing_time datetime64[us, UTC]
validity object
-- outpat_recipe_detail 门诊医嘱表 四亿四 科室 specialty 可能需要从visit表中获取
outpat_recipe_detail_id object
visit_record_id object 来访记录编号
data_source_primary_key object 数据源主键
prescription_time datetime64[us, UTC] 开方时间 order_datetime
prescription_source_no object 处方原始编号
prescription_item_source_no object 处方项目原始编号
group_no object 成组序号
prescription_type_standard_code object
prescription_type_standard_name object
prescription_type_code object
prescription_type_name object
order_class_standard_code object
order_class_standard_name object
order_class_code object 处方类别代码
order_class_name object 处方类别名称
drug_code object 药品代码
drug_name object 药品名称 rx_desc
specs object 规格 drug_spec
drug_form_standard_code object
drug_form_standard_name object
drug_form_code object 剂型代码
drug_form_name object 剂型名称
unit_price object 单价
qty object 数量 quantity
frequency_standard_code object
frequency_standard_name object
frequency_code object 频次代码
frequency_name object 频次名称 frequency
dose object 单次剂量 dosage_qty
dose_unit_code object 剂量单位代码
dose_unit_name object 剂量单位名称 dosage_unit
route_standard_code object
route_standard_name object
route_code object 给药途径代码
route_name object 给药途径名称
package_num float64 草药付数
day_num float64 天数 days_supply
skin_test bool 皮试标志
skin_test_result object 皮试结果
order_state_standard_code object
order_state_standard_name object
order_state_code object 医嘱状态代码
order_state_name object 医嘱状态名称
first_governing_time datetime64[us, UTC]
last_governing_time datetime64[us, UTC]
validity object
null as rx_start_datetime
null as rx_end_datetime
医嘱数量单位 quantity_uom 没有找到此列
-- inpat_drug_order 住院医嘱表 一亿多 patient_type 从visit获取
inpat_drug_order_id object
visit_record_id object 来访记录编号
data_source_primary_key object 数据源主键
ward_name object 病区名称
bed_name object 床位名称
specialty_standard_code object
specialty_standard_name object
specialty_code object 科室代码
specialty_name object 科室名称 specialty
order_source_id object 医嘱编号
group_no object 成组序号
order_class_standard_code object
order_class_standard_name object
order_class_code object 医嘱类别代码
order_class_name object 医嘱类别名称
order_type_standard_code object
order_type_standard_name object
order_type_code object 医嘱长临代码
order_type_name object 医嘱长临名称
drug_code object 药品代码
drug_name object 药品名称 rx_desc
specs object 规格 drug_spec
drug_form_standard_code object
drug_form_standard_name object
drug_form_code object 剂型代码
drug_form_name object 剂型名称
unit_price object 单价
qty object 数量 quantity
dose object 单次剂量 dosage_qty
dose_unit_code object 剂量单位代码
dose_unit_name object 剂量单位名称 dosage_unit
frequency_standard_code object
frequency_standard_name object
frequency_code object 频次代码
frequency_name object 频次名称 frequency
route_standard_code object
route_standard_name object
route_code object 给药途径代码
route_name object 给药途径名称
package_num float64 草药付数
executing_specialty_standard_code object
executing_specialty_standard_name object
executing_specialty_code object 执行科室代码
executing_specialty_name object 执行科室名称
skin_test object 皮试标志
skin_test_result object 皮试结果
begin_time datetime64[us, UTC] 医嘱开始时间 rx_start_datetime
end_time datetime64[us, UTC] 医嘱结束时间 rx_end_datetime
input_time datetime64[us, UTC] 开立医嘱时间 order_datetime
stop_time datetime64[us, UTC] 停止时间
order_state_standard_code object
order_state_standard_name object
order_state_code object 医嘱状态代码
order_state_name object 医嘱状态名称
first_governing_time datetime64[us, UTC]
last_governing_time datetime64[us, UTC]
validity object
医嘱数量单位 quantity_uom 没有找到此列
end_time - begin_time = 医嘱执行的天数 days_supply (结束时间减去开始时间)
-- lab_report 化验申请单 一亿九
lab_report_id object
visit_record_id object 来访记录编号
data_source_primary_key object 数据源主键
report_source_id object 报告编号
report_type_standard_code object
report_type_standard_name object
report_type_code object 报告分类代码
report_type_name object 报告分类名称
report_name object 报告名称 lab_name
specimen_standard_code object
specimen_standard_name object
specimen_code object 样本类型代码
specimen_name object 样本类型名称 specimen_source
specimen_barcode object 样本条码号
requesting_specialty_standard_code object
requesting_specialty_standard_name object
requesting_specialty_code object 申请科室代码
requesting_specialty_name object 申请科室名称
requestor_code object 申请人代码
requestor_name object 申请人姓名
executing_specialty_standard_code object
executing_specialty_standard_name object
executing_specialty_code object 执行科室代码
executing_specialty_name object 执行科室名称
executor_code object 执行人代码
executor_name object 执行人姓名
specimen_collected_time datetime64[us, UTC] 采样时间
report_time datetime64[us, UTC] 报告时间 result_datetime
reporter_code object 报告人代码
reporter_name object 报告人姓名
verifying_time datetime64[us, UTC] 审核时间
verifier_code object 审核人代码
verifier_name object 审核人姓名
instrument_name object 仪器名称
first_governing_time datetime64[us, UTC]
last_governing_time datetime64[us, UTC]
validity object
-- lab_report_result 化验结果单 十九亿
lab_report_result_id object
lab_report_id object 可与申请单关联?
data_source_primary_key object 数据源主键
result_source_id object 结果编号
test_item_code object 检验项目代码
test_item_name object 检验项目名称 lab_item_name (std_lab_item_name 标化列)
text_value object 文本结果
text_value_show object 文本结果显示
numerical_value object 数值结果 result_num
unit_code object 单位代码
unit_name object 单位名称
normal_low object 正常低值
normal_high object 正常高值
reference_range object 参考范围
critical_low object 危急低值
critical_high object 危急高值
abnormal_flag_code object 检验异常标志代码
abnormal_flag_name object 检验异常标志说明
critical_flag object 危急值标志
first_governing_time datetime64[us, UTC]
last_governing_time datetime64[us, UTC]
validity object
体征表 查询返回了空应该没有数据
"SELECT table_name FROM information_schema.tables WHERE table_name LIKE '%vital%'"返回结果如下
0 vital_signs_record
1 mdm_vital_signs_item
2 vital_signs_record
3 mdm_vital_signs_item
4 vital_signs_record
5 mdm_vital_signs_item
6 vital_signs_record
7 mdm_vital_signs_item
MD_medical_record 四千万
from data_query import *
query = "select * from iceberg.cdm.visit_record LIMIT 10;"
df = execute_query(query)
# Print every column name together with its dtype.
# (Fixed: the loop body had lost its indentation, which is a syntax error.)
for column in df.columns:
    print(f"Column: {column}, Type: {df[column].dtype}")
# Alternatively, print the dtype summary of the whole DataFrame at once.
print(df.dtypes)
-- 时间列数据类型查询
SELECT * FROM iceberg.cdm.outpatient_record WHERE CAST(date_of_visiting AS DATE) >= DATE '2021-01-01' AND CAST(date_of_visiting AS DATE) <= DATE '2024-12-31';
-- 查询 符合 的 visit 人数
WITH t1 AS (
    -- Outpatient and inpatient visits of 2021-2024, keyed by source visit
    -- number; the organisation code is exposed as provider_id.
    select distinct
        outpatient_source_no as visit_id,
        patient_source_id as patient_id,
        date_of_visiting as admission_datetime,
        specialty_name as specialty,
        org_code as provider_id
    from iceberg.cdm.outpatient_record
    where date_of_visiting >= to_date('20210101', 'yyyyMMdd') and date_of_visiting <= to_date('20241231', 'yyyyMMdd')
    union all
    select distinct
        inpatient_source_no as visit_id,
        patient_source_id as patient_id,
        admi_time as admission_datetime,
        admi_specialty_name as specialty,
        org_code as provider_id
    from iceberg.cdm.inpat_record
    where admi_time >= to_date('20210101', 'yyyyMMdd') and admi_time <= to_date('20241231', 'yyyyMMdd')
),
t2 as (
    -- All visits (c) of patients having a diabetes-diagnosed visit (b).
    -- t1 exposes the organisation column as provider_id, so it must be
    -- referenced under that name; it is re-aliased to keep the output column
    -- called org_code.
    select distinct b.patient_id, c.visit_id, c.provider_id as org_code, c.admission_datetime, c.specialty
    from iceberg.cdm.patient_diagnosis a
    join
        t1 b on a.org_code = b.provider_id and a.visit_source_no = b.visit_id
    join
        t1 c on b.provider_id = c.provider_id and b.patient_id = c.patient_id
    -- String literals must be single-quoted; double quotes denote identifiers.
    where a.diagnosis_name like '%糖尿病%')
select * from t2;
WITH t1 AS (
select distinct
outpatient_source_no as visit_id,
outpatient_record_id as patient_id
from iceberg.cdm.outpatient_record
union all
select distinct
inpatient_source_no as visit_id,
inpat_record_id as patient_id
from iceberg.cdm.inpat_record
)
select * from t1;
WITH t1 AS (
select distinct
outpatient_source_no as visit_id,
outpatient_record_id as patient_id
from iceberg.cdm.outpatient_record
)
select * from t1;
-- 糖尿病诊断人次
select count(*) from iceberg.cdm.patient_diagnosis where diagnosis_name like '%糖尿病%'; -- 一千五百万
select count(distinct visit_record_id) from iceberg.cdm.patient_diagnosis where diagnosis_name like '%糖尿病%'; -- 一千五百万
visit_record_id
select count(*)
from iceberg.cdm.outpatient_record
WHERE CAST(date_of_visiting AS DATE) >= DATE '2021-01-01' AND CAST(date_of_visiting AS DATE) <= DATE '2024-12-31'; -- 两亿六
SELECT admission_time
FROM iceberg.cdm.inpat_record
WHERE admission_time >= TIMESTAMP '2021-01-01 00:00:00+00:00'
AND admission_time < TIMESTAMP '2025-01-01 00:00:00+00:00'
LIMIT 10; -- 七百多万
SELECT count(distinct a.visit_record_id)
FROM iceberg.cdm.outpatient_record a join iceberg.cdm.patient_diagnosis b
on a.visit_record_id = b.visit_record_id
WHERE a.date_of_visiting >= '2021-01-01' AND a.date_of_visiting < '2025-01-01'
AND b.diagnosis_name LIKE '%糖尿病%'; -- 九百多万
-- 可能存在问题 sql查询时间
-- 查询出数据量限制问题
-- Spot check of one outpatient_source_no value. (Fixed: missing terminator,
-- which made this statement run into the following SELECT.)
select outpatient_source_no from iceberg.cdm.outpatient_record where outpatient_source_no = '320111201611243336||01';
select a.outpatient_source_no,b.patient_source_id
from iceberg.cdm.outpatient_record a
join iceberg.cdm.patient_base_info b
on a.outpatient_source_no = b.patient_source_id LIMIT 10000;
-- 2025-01-07需要确认问题 ************************************************************
-- visit_record_id 是 visit_id
-- pat_base_id 是 patient_id
-- 确定各表visit_id 与patient_id 表关联
select * from iceberg.cdm.outpatient_record LIMIT 10-- 门诊就诊
iceberg.cdm.inpat_record -- 住院就诊
iceberg.cdm.patient_diagnosis --
-- 选取一个机构确定数据量多的
select organization_id,count(*) from iceberg.cdm.patient_base_info group by organization_id
organization_id count(*)
0 320106426090445 34292995
1 320104466002630 34071579
2 320106466000838 31353368
24 320118426080415 715027
25 32011442602056X 735720
26 320114426020535 536525
-- 是否支持WITH 语句 和多层WITH 语句 支持
WITH t1 AS (SELECT diagnosis_name from iceberg.cdm.patient_diagnosis LIMIT 10) select * from t1;
WITH t1 AS (
SELECT diagnosis_name
FROM iceberg.cdm.patient_diagnosis
LIMIT 10
),
t2 AS (
SELECT * FROM t1
LIMIT 5
),
t3 AS (
SELECT * FROM t2 LIMIT 3)
SELECT * FROM t3
;
-- 是否支持正则 和查看 diagnosis_type_name 是患者类型 ‘门急诊诊断 出院诊断 主治医师诊断 步诊断’ 还是诊断来源或其他
-- 支持正则 !~* ~* 注: sql内where之后只能用单引号
SELECT diagnosis_name,diagnosis_type_name from iceberg.cdm.patient_diagnosis where diagnosis_name ~* '糖尿病' LIMIT 10;
-- 确定就诊门诊表 is_emergency是否急诊列内数据 内部数据如下:
0 False 354293095
1 True 31448863
SELECT is_emergency,count(*) FROM iceberg.cdm.outpatient_record group by is_emergency LIMIT 1000 ;
-- 药品两列日期相减 如果运行过慢将不处理 后续用程序 或留给分析师处理
-- 可直接相减 但有很多错误医嘱结束时间 ‘end_time’
select begin_time,end_time from iceberg.cdm.inpat_drug_order LIMIT 10;
select begin_time,end_time,end_time - begin_time as days_supply from iceberg.cdm.inpat_drug_order LIMIT 10;
-- 确定体征表 模型中表格应该是为空的 与数据对接人沟通
-- 已经沟通 2025-01-07 11:30 确定库里无数据
select * from iceberg.cdm.vital_signs_record LIMIT 10;
-- 再确认下医嘱表是否有数量单位列 ‘quantity_uom’ 已经确认 无此列数据
-- 导出数据量测试 限制条数 查看导出数据是否完整 不完整 已经确认
select begin_time,end_time from iceberg.cdm.inpat_drug_order LIMIT 100000;
-- 化验数据探查
select test_item_name,count(*) from iceberg.cdm.lab_report_result
where test_item_name ~* 'C肽|C-PR'
and test_item_name ~* '空腹|餐后|分钟|小时'group by test_item_name order by count(*) LIMIT 100;
select test_item_name,count(*) from iceberg.cdm.lab_report_result
where test_item_name ~* '空腹|FPG|空腹血糖'
and test_item_name ~* '血' group by test_item_name order by count(*) LIMIT 100;
select test_item_name,count(*) from iceberg.cdm.lab_report_result
where
test_item_name ~* 'OGTT|耐量|负荷'
and test_item_name ~* '2|120'
group by test_item_name order by count(*) LIMIT 100;
-- 确定最小数据提取 确定sql代码能跑通
-- 确定每张表待提取量
-- 然后代码执行全量循环提取到本地 或一年一年提取 晚上循环提取?
-- 检查数据量是否与待提取量一致
-- 再执行标化等工作 由辉哥的程序执行
-- visit_record 396901676
Column: validity, Type: object
visit_record_id object visit_id
pat_base_id object patient_id
organization_id object 组织id
org_standard_code object
org_standard_name object
patient_type_standard_code object
patient_type_standard_name object
data_source_primary_key object
source_no object
card_no object
first_governing_time datetime64[us, UTC]
last_governing_time datetime64[us, UTC]
validity
outpatient_record_id visit_record_id 内数据一致
-- 下面是需要提数据各表在库中的数据列
"门诊或住院" as patient_type
outpatient_record 门诊就诊表
outpatient_record_id object 门诊记录编号
visit_record_id object 来访记录编号
outpatient_source_no object 门诊流水号
data_source_primary_key object 数据源主键
is_emergency bool 是否急诊 急诊类型不能丢失 其他表格的患者类型 可能需要关联此表来获取
specialty_standard_code object 科室标准代码
specialty_standard_name object 科室标准名称
specialty_code object 就诊科室代码
specialty_name object 就诊科室名称 specialty
date_of_visiting object 就诊日期 admission_datetime
MD_code object 接诊医生代码
MD_name object 接诊医生姓名
first_governing_time datetime64[us, UTC] 第一次治理时间
last_governing_time datetime64[us, UTC] 上次治理时间
validity object 有效性标准代码
"住院" as patient_type
inpat_record 住院就诊表
inpat_record_id object 住院记录编号
visit_record_id object 来访记录编号
inpatient_source_no object 住院流水号
number_of_hospitalizations float64 第几次住院
data_source_primary_key object 数据源主键
admission_specialty_standard_code object 入院科室标准代码
admission_specialty_standard_name object 入院科室标准名称
admission_specialty_code object 入院科室代码
admission_specialty_name object 入院科室名称 specialty
admission_time datetime64[us, UTC] 入院时间 admission_datetime
discharge_specialty_standard_code object 出院科室标准代码
discharge_specialty_standard_name object 出院科室标准名称
discharge_specialty_code object 出院科室代码
discharge_specialty_name object 出院科室名称
discharge_time datetime64[us, UTC] 出院时间
first_governing_time datetime64[us, UTC] 第一次治理时间
last_governing_time datetime64[us, UTC] 上次治理时间
科室从visit表中获取
患者类型需要查找或从visit中获取
patient_diagnosis 诊断表
diagnosis_id object 诊断编号
visit_record_id object 来访记录编号
data_source_primary_key object 数据源主键
diagnosis_type_standard_code object 诊断类型标准代码
diagnosis_type_standard_name object 诊断类型标准名称
diagnosis_type_code object 诊断类型代码
diagnosis_type_name object 诊断类型名称 可能是患者类型 需要确认
diagnosis_standard_code object 诊断标准代码
diagnosis_standard_name object 诊断标准名称
diagnosis_code object 诊断代码 dx icd10_code
diagnosis_name object 诊断名称 dx_desc icd10_name
single_diagnosis_name object 单一的诊断名称
icd_edition_standard_code object Icd版标准代码
icd_edition_standard_name object Icd版标准名称
is_primary bool 是否主要诊断
diagnosis_time datetime64[us, UTC] 诊断时间 diagnosis_datetime
first_governing_time datetime64[us, UTC] 第一次治理时间
last_governing_time datetime64[us, UTC] 上次治理时间
validity object
patient_base_info 患者表
pat_base_id object patient_id
patient_source_id object 患者编号 patient_id
data_source_primary_key object 数据源主键
organization_id object 组织id
org_standard_code object
org_standard_name object
patient_name object
gender_standard_code object
gender_standard_name object
gender_code object 性别代码
gender_name object 性别名称 sex
date_of_birth object 出生日期 birth_date
country_standard_code object
country_standard_name object
country_code object
country_name object
nation_standard_code object
nation_standard_name object
nation_code object
nation_name object
id_type_standard_code object
id_type_standard_name object
id_type_code object
id_type_name object
id_no object
outpatient_MRN object
inpatient_MRN object
MRN object
home_address object
employer object
cell_phone object
email object
contact_name object
contact_relation_standard_code object
contact_relation_standard_name object
contact_relation_code object
contact_relation_name object
contact_gender_standard_code object
contact_gender_standard_name object
contact_gender_code object
contact_gender_name object
contact_phone object
first_governing_time datetime64[us, UTC]
last_governing_time datetime64[us, UTC]
validity object
-- outpat_recipe_detail 门诊医嘱表 科室 specialty 可能需要从visit表中获取
outpat_recipe_detail_id object
visit_record_id object 来访记录编号
data_source_primary_key object 数据源主键
prescription_time datetime64[us, UTC] 开方时间 order_datetime
prescription_source_no object 处方原始编号
prescription_item_source_no object 处方项目原始编号
group_no object 成组序号
prescription_type_standard_code object
prescription_type_standard_name object
prescription_type_code object
prescription_type_name object
order_class_standard_code object
order_class_standard_name object
order_class_code object 处方类别代码
order_class_name object 处方类别名称
drug_code object 药品代码
drug_name object 药品名称 rx_desc
specs object 规格 drug_spec
drug_form_standard_code object
drug_form_standard_name object
drug_form_code object 剂型代码
drug_form_name object 剂型名称
unit_price object 单价
qty object 数量 quantity
frequency_standard_code object
frequency_standard_name object
frequency_code object 频次代码
frequency_name object 频次名称 frequency
dose object 单次剂量 dosage_qty
dose_unit_code object 剂量单位代码
dose_unit_name object 剂量单位名称 dosage_unit
route_standard_code object
route_standard_name object
route_code object 给药途径代码
route_name object 给药途径名称
package_num float64 草药付数
day_num float64 天数 days_supply
skin_test bool 皮试标志
skin_test_result object 皮试结果
order_state_standard_code object
order_state_standard_name object
order_state_code object 医嘱状态代码
order_state_name object 医嘱状态名称
first_governing_time datetime64[us, UTC]
last_governing_time datetime64[us, UTC]
validity object
null as rx_start_datetime
null as rx_end_datetime
医嘱数量单位 quantity_uom 没有找到此列
-- inpat_drug_order 住院医嘱表 patient_type 从visit获取
inpat_drug_order_id object
visit_record_id object 来访记录编号
data_source_primary_key object 数据源主键
ward_name object 病区名称
bed_name object 床位名称
specialty_standard_code object
specialty_standard_name object
specialty_code object 科室代码
specialty_name object 科室名称 specialty
order_source_id object 医嘱编号
group_no object 成组序号
order_class_standard_code object
order_class_standard_name object
order_class_code object 医嘱类别代码
order_class_name object 医嘱类别名称
order_type_standard_code object
order_type_standard_name object
order_type_code object 医嘱长临代码
order_type_name object 医嘱长临名称
drug_code object 药品代码
drug_name object 药品名称 rx_desc
specs object 规格 drug_spec
drug_form_standard_code object
drug_form_standard_name object
drug_form_code object 剂型代码
drug_form_name object 剂型名称
unit_price object 单价
qty object 数量 quantity
dose object 单次剂量 dosage_qty
dose_unit_code object 剂量单位代码
dose_unit_name object 剂量单位名称 dosage_unit
frequency_standard_code object
frequency_standard_name object
frequency_code object 频次代码
frequency_name object 频次名称 frequency
route_standard_code object
route_standard_name object
route_code object 给药途径代码
route_name object 给药途径名称
package_num float64 草药付数
executing_specialty_standard_code object
executing_specialty_standard_name object
executing_specialty_code object 执行科室代码
executing_specialty_name object 执行科室名称
skin_test object 皮试标志
skin_test_result object 皮试结果
begin_time datetime64[us, UTC] 医嘱开始时间 rx_start_datetime
end_time datetime64[us, UTC] 医嘱结束时间 rx_end_datetime
input_time datetime64[us, UTC] 开立医嘱时间 order_datetime
stop_time datetime64[us, UTC] 停止时间
order_state_standard_code object
order_state_standard_name object
order_state_code object 医嘱状态代码
order_state_name object 医嘱状态名称
first_governing_time datetime64[us, UTC]
last_governing_time datetime64[us, UTC]
validity object
医嘱数量单位 quantity_uom 没有找到此列
end_time - begin_time = 医嘱执行的天数 days_supply (结束时间减去开始时间)
-- lab_report 化验申请单
lab_report_id object
visit_record_id object 来访记录编号
data_source_primary_key object 数据源主键
report_source_id object 报告编号
report_type_standard_code object
report_type_standard_name object
report_type_code object 报告分类代码
report_type_name object 报告分类名称
report_name object 报告名称 lab_name
specimen_standard_code object
specimen_standard_name object
specimen_code object 样本类型代码
specimen_name object 样本类型名称 specimen_source
specimen_barcode object 样本条码号
requesting_specialty_standard_code object
requesting_specialty_standard_name object
requesting_specialty_code object 申请科室代码
requesting_specialty_name object 申请科室名称
requestor_code object 申请人代码
requestor_name object 申请人姓名
executing_specialty_standard_code object
executing_specialty_standard_name object
executing_specialty_code object 执行科室代码
executing_specialty_name object 执行科室名称
executor_code object 执行人代码
executor_name object 执行人姓名
specimen_collected_time datetime64[us, UTC] 采样时间
report_time datetime64[us, UTC] 报告时间 result_datetime
reporter_code object 报告人代码
reporter_name object 报告人姓名
verifying_time datetime64[us, UTC] 审核时间
verifier_code object 审核人代码
verifier_name object 审核人姓名
instrument_name object 仪器名称
first_governing_time datetime64[us, UTC]
last_governing_time datetime64[us, UTC]
validity object
-- Probe: do C-peptide result rows match lab reports on
-- result_source_id = report_source_id?
-- (Fixed: the fragment lacked the leading WITH and ended in a stray ")".)
WITH t7 AS (
    -- C-peptide result rows whose item name also mentions fasting or a time point.
    select DISTINCT * from iceberg.cdm.lab_report_result
    where test_item_name ~* 'C肽|C-PR' and test_item_name ~* '空腹|1|60|2|120|3|180'
),
t8 AS (
    -- Lab reports issued from 2021-01-01 up to (not including) 2022-02-01.
    select *
    from iceberg.cdm.lab_report
    where report_time >= TIMESTAMP '2021-01-01 00:00:00+00:00'
      AND report_time < TIMESTAMP '2022-02-01 00:00:00+00:00'
)
select a.result_source_id, b.report_source_id
from t7 a
join t8 b on a.result_source_id = b.report_source_id;
-- lab_report_result 化验结果单
lab_report_result_id object
lab_report_id object 可与申请单关联?
data_source_primary_key object 数据源主键
result_source_id object 结果编号
test_item_code object 检验项目代码
test_item_name object 检验项目名称 lab_item_name (std_lab_item_name 标化列)
text_value object 文本结果
text_value_show object 文本结果显示
numerical_value object 数值结果 result_num
unit_code object 单位代码
unit_name object 单位名称
normal_low object 正常低值
normal_high object 正常高值
reference_range object 参考范围
critical_low object 危急低值
critical_high object 危急高值
abnormal_flag_code object 检验异常标志代码
abnormal_flag_name object 检验异常标志说明
critical_flag object 危急值标志
first_governing_time datetime64[us, UTC]
last_governing_time datetime64[us, UTC]
validity object
体征表:查询返回为空,应该没有数据。
"SELECT table_name FROM information_schema.tables WHERE table_name LIKE '%vital%'" 返回结果如下:
0 vital_signs_record
1 mdm_vital_signs_item
2 vital_signs_record
3 mdm_vital_signs_item
4 vital_signs_record
5 mdm_vital_signs_item
6 vital_signs_record
7 mdm_vital_signs_item
MD_medical_record
四年有糖尿病患者的人数 1691600
执行时间
开始执行 ETL 任务...
/usr/local/lib/python3.10/site-packages/adbc_driver_manager/dbapi.py:307: Warning: Cannot disable autocommit; conn will not be DB-API 2.0 compliant
warnings.warn(
查询 1 的行数: 10369
成功写入查询 1 的结果到 ./test_patient.csv,行数: 10369
查询 2 的行数: 4095
成功写入查询 2 的结果到 ./test_patient.csv,行数: 4095
查询 1 的行数: 11452
成功写入查询 1 的结果到 ./test_visit.csv,行数: 11452
查询 2 的行数: 5923
成功写入查询 2 的结果到 ./test_visit.csv,行数: 5923
查询 1 的行数: 212
成功写入查询 1 的结果到 ./test_lab_result_cm.csv,行数: 212
查询 2 的行数: 3
成功写入查询 2 的结果到 ./test_lab_result_cm.csv,行数: 3
查询 1 的行数: 11285
成功写入查询 1 的结果到 ./test_prescribing.csv,行数: 11285
查询 2 的行数: 20
成功写入查询 2 的结果到 ./test_prescribing.csv,行数: 20
查询 1 的行数: 10561
成功写入查询 1 的结果到 ./test_diagnosis.csv,行数: 10561
查询 2 的行数: 3432
成功写入查询 2 的结果到 ./test_diagnosis.csv,行数: 3432
所有 ETL 任务共耗时: 804.12
\ No newline at end of file
关于第 2、3 部分:南京内部是以 SQL 查询服务的方式提供,Schema 有文档描述,访问接口是 Arrow Flight SQL。
Flight SQL 有不同语言的 Client 实现:
https://arrow.apache.org/docs/format/FlightSql.html
https://pypi.org/project/adbc-driver-flightsql/ (python)
https://arrow.apache.org/docs/java/flight_sql_jdbc_driver.html (java)
https://pkg.go.dev/github.com/apache/arrow/go/v12/arrow/flight/flightsql (golang)
-江苏省人民医院 用药化验 50 化验没有提交
Markdown 格式
0%
您添加了 0 到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论