提交 7c09887c,作者:郑德辉

提交代码

上级 0e41030b
.idea
venv
*.xlsx
\ No newline at end of file
import pandas as pd
import time
# Name of the sheet that receives the unmodified input data.
_ORIGINAL_SHEET = 'Original Data'
# Worksheet-name constraints enforced by Excel/XlsxWriter: at most 31
# characters and none of the characters [ ] : * ? / \ .
_MAX_SHEET_NAME_LEN = 31
_INVALID_SHEET_CHARS = str.maketrans(dict.fromkeys('[]:*?/\\', '_'))


def _split_items(cell):
    """Return the distinct, non-empty comma-separated items of ``cell``, in order."""
    parts = (part.strip() for part in str(cell).split(','))
    return list(dict.fromkeys(part for part in parts if part))


def _unique_sheet_name(label, taken):
    """Turn ``label`` into a valid worksheet name not yet present in ``taken``.

    ``taken`` holds the lower-cased names already used and is updated in place.
    """
    # Replace forbidden characters, enforce the length limit and remove
    # leading/trailing apostrophes, which worksheet names may not have.
    base = label.translate(_INVALID_SHEET_CHARS)[:_MAX_SHEET_NAME_LEN].strip("'")
    if not base:
        base = 'Sheet'
    candidate = base
    counter = 1
    # Worksheet names are compared case-insensitively, so track lower-cased names.
    while candidate.lower() in taken:
        counter += 1
        suffix = '_' + str(counter)
        candidate = base[:_MAX_SHEET_NAME_LEN - len(suffix)] + suffix
    taken.add(candidate.lower())
    return candidate


def process_data(datafile, target_file, column_name):
    """Split an Excel sheet into one worksheet per comma-separated item.

    The first sheet of ``datafile`` is read and written unchanged to the
    'Original Data' sheet of ``target_file``.  Every non-null cell of
    ``column_name`` is then split on commas; for each distinct item a
    worksheet is written that contains the rows listing exactly that item,
    with ``column_name`` replaced by the item itself.  Worksheets are ordered
    by item length, then alphabetically.  The elapsed time is printed.

    Args:
        datafile: Path of the Excel file to read.
        target_file: Path of the Excel file to create (written with the
            'xlsxwriter' engine).
        column_name: Name of the column holding the comma-separated items.

    Raises:
        KeyError: If ``column_name`` is not a column of the input sheet.
    """
    start = time.perf_counter()

    df = pd.read_excel(datafile)
    # Fail before the output file is created rather than leaving a partial file.
    if column_name not in df.columns:
        raise KeyError('column %r not found in %s' % (column_name, datafile))

    # Work on a copy so the assignment below neither triggers a chained
    # assignment warning nor requires changing global pandas options.
    labelled = df[df[column_name].notnull()].copy()
    labelled[column_name] = labelled[column_name].map(_split_items)

    # One row per (source row, item).  Rows are matched to an item by exact
    # equality, so an item is never treated as a regex or as a substring of
    # a longer item.
    per_item = labelled.explode(column_name).dropna(subset=[column_name])
    rows_by_item = {
        item: rows for item, rows in per_item.groupby(column_name, sort=False)
    }
    # Shortest names first; ties broken alphabetically for a stable order.
    ordered_items = sorted(rows_by_item, key=lambda item: (len(item), item))

    taken_names = {_ORIGINAL_SHEET.lower()}
    with pd.ExcelWriter(target_file, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name=_ORIGINAL_SHEET, index=False)
        for item in ordered_items:
            rows_by_item[item].to_excel(
                writer,
                sheet_name=_unique_sheet_name(item, taken_names),
                index=False,
            )

    runtime = time.perf_counter() - start
    print('*************程序运行时间为:', runtime, '秒')
# Script entry: choose one input workbook and split it.
#
# Alternative configuration for the diagnosis workbook (disabled):
#   datafile = r'./诊断标化_第四版.xlsx'
#   target_file = r'./诊断标化_第四版_诊断拆分.xlsx'
#   column_name = 'std_dx_desc'
#   process_data(datafile, target_file, column_name)

# Active configuration: input path, output path, and the column to split.
datafile, target_file, column_name = (
    r'./药品标化第三版.xlsx',
    r'./药品标化第三版_药品拆分.xlsx',
    'std_rx_desc',
)
process_data(datafile=datafile, target_file=target_file, column_name=column_name)
Markdown 格式
0%
您添加了 0 人到此讨论。请谨慎行事。
请先完成此评论的编辑!
注册 或者 登录 后发表评论