提交代码

7c09887c · 郑德辉 · 0e41030b · 7c09887c · 7c09887c
--- a/.gitignore
+++ b/.gitignore
+
+.idea
+venv
+*.xlsx
\ No newline at end of file
--- a/one_to_more_sheet.py
+++ b/one_to_more_sheet.py
+import pandas as pd
+import time
+
+def _split_items(cell):
+    """Return the non-empty, whitespace-trimmed comma-separated items of *cell*."""
+    # str() lets numeric or other non-string cells be processed instead of crashing.
+    parts = (part.strip() for part in str(cell).split(','))
+    return [part for part in parts if part]
+
+
+def _safe_sheet_name(name, used):
+    """Return an Excel-legal sheet name derived from *name* that is not in *used*.
+
+    Excel sheet names may not exceed 31 characters, may not contain any of
+    ``[ ] : * ? / \\``, may not start or end with an apostrophe, and must be
+    unique ignoring case.  *used* is a set of lower-cased names already taken;
+    the returned name is added to it.
+    """
+    cleaned = ''.join('_' if ch in '[]:*?/\\' else ch for ch in name)
+    cleaned = cleaned[:31].strip("'") or 'Sheet'
+
+    candidate = cleaned
+    suffix = 1
+    while candidate.lower() in used:
+        # Make room for a numeric suffix so the result still fits in 31 chars.
+        suffix += 1
+        tail = '_' + str(suffix)
+        candidate = cleaned[:31 - len(tail)] + tail
+    used.add(candidate.lower())
+    return candidate
+
+
+def process_data(datafile, target_file, column_name):
+    """Split an Excel sheet into one sheet per comma-separated item of a column.
+
+    The first sheet of *datafile* is read and written unchanged to a sheet
+    named 'Original Data' in *target_file*.  Then, for every distinct item
+    found in the comma-separated values of *column_name*, a sheet is written
+    containing the rows whose cell lists exactly that item, with the column
+    overwritten by the item itself.  Rows whose cell is null are ignored.
+    Sheets are ordered by item length (shortest first); ties keep the order
+    of first appearance in the data.  The elapsed time is printed at the end.
+
+    Args:
+        datafile: Path of the Excel file to read.
+        target_file: Path of the Excel file to create.
+        column_name: Name of the column holding comma-separated items.
+    """
+    start = time.perf_counter()
+
+    # Read the Excel file
+    df = pd.read_excel(datafile)
+
+    with pd.ExcelWriter(target_file, engine='xlsxwriter') as writer:
+        # Write original data to the first sheet
+        df.to_excel(writer, sheet_name='Original Data', index=False)
+
+        # Keep only rows that actually have a value in the column
+        df = df[df[column_name].notnull()]
+
+        # Parse every cell once; reused for both item discovery and filtering
+        row_items = [_split_items(cell) for cell in df[column_name]]
+
+        # dict.fromkeys de-duplicates while keeping first-appearance order,
+        # which makes the output order reproducible between runs
+        unique_items = dict.fromkeys(
+            item for items in row_items for item in items
+        )
+
+        used_names = {'original data'}
+        for item in sorted(unique_items, key=len):
+            # Exact item membership: a plain substring/regex match would put
+            # rows for "AB" into sheet "A" and break on names like "X(Y)"
+            mask = [item in items for items in row_items]
+
+            # Copy before overwriting so the filtered view of df is not
+            # modified and no chained-assignment warning is triggered
+            df_item = df[mask].copy()
+            df_item[column_name] = item
+
+            df_item.to_excel(
+                writer,
+                sheet_name=_safe_sheet_name(item, used_names),
+                index=False,
+            )
+
+    runtime = time.perf_counter() - start
+    print('*************程序运行时间为:', runtime, '秒')
+
+# Usage
+# Alternative input (diagnosis data):
+# datafile = r'./诊断标化_第四版.xlsx'  # Input file path
+# target_file = r'./诊断标化_第四版_诊断拆分.xlsx'  # Output file path
+# column_name = 'std_dx_desc'  # Column name
+# process_data(datafile, target_file, column_name)
+
+# Run the split only when executed as a script, so importing this module
+# does not read or write any Excel file as a side effect.
+if __name__ == '__main__':
+    datafile = r'./药品标化第三版.xlsx'  # Input file path
+    target_file = r'./药品标化第三版_药品拆分.xlsx'  # Output file path
+    column_name = 'std_rx_desc'  # Column name
+    process_data(datafile, target_file, column_name)