From 4909705a2d7f44d0adfce65f138d76f89e23dee7 Mon Sep 17 00:00:00 2001 From: admin <572701190@qq.com> Date: Tue, 19 May 2026 17:41:24 +0800 Subject: [PATCH] Tighten lab item routing and dedupe rows --- app/processor.py | 20 ++++ .../V1-ALL_convert_Lab_Test_data.py | 3 +- ...V2-Every_Pat_File_convert_Lab_Test_data.py | 2 +- app/processors/dynamic_router.py | 98 +++++++++++++++---- tests/verify_dynamic_routing.py | 28 ++++++ 5 files changed, 127 insertions(+), 24 deletions(-) diff --git a/app/processor.py b/app/processor.py index aa59f13..6eae18e 100644 --- a/app/processor.py +++ b/app/processor.py @@ -464,6 +464,7 @@ def _postprocess_workbook( workbook = load_workbook(path) try: _normalize_unmatched_columns(workbook) + _deduplicate_regular_rows(workbook) summary_records = _collect_summary_records(workbook) _remove_not_found_rows(workbook) _remove_empty_unmatched_columns(workbook) @@ -555,6 +556,25 @@ def _remove_not_found_rows(workbook) -> None: sheet.delete_rows(row_index, 1) +def _deduplicate_regular_rows(workbook) -> None: + for sheet in workbook.worksheets: + if sheet.title == SUMMARY_SHEET_NAME or sheet.max_row < 3: + continue + + seen = set() + for row_index in range(sheet.max_row, 1, -1): + values = tuple( + _cell_text(sheet.cell(row_index, col).value) + for col in range(1, sheet.max_column + 1) + ) + if not any(values): + continue + if values in seen: + sheet.delete_rows(row_index, 1) + else: + seen.add(values) + + def _collect_summary_records(workbook) -> list[dict[str, object]]: records: list[dict[str, object]] = [] for sheet in workbook.worksheets: diff --git a/app/processors/V1-ALL_convert_Lab_Test_data.py b/app/processors/V1-ALL_convert_Lab_Test_data.py index 4506918..f0159a1 100644 --- a/app/processors/V1-ALL_convert_Lab_Test_data.py +++ b/app/processors/V1-ALL_convert_Lab_Test_data.py @@ -271,7 +271,7 @@ for pat_no in pat_no_col: with open(row_file_path, "r", encoding='utf-8-sig') as row_file: detail_rows = list(csv.DictReader(row_file)) - sheet_results, unassigned_items = route_detail_rows(detail_rows, ALL_tests) + sheet_results, unassigned_items = route_detail_rows(detail_rows, ALL_tests, row_1.get("req_reason", "")) append_routed_report( add_content_to_excel, result_save_pth, @@ -286,4 +286,3 @@ for pat_no in pat_no_col: # 每处理一个患者数据,保存相关信息 save_excel() - diff --git a/app/processors/V2-Every_Pat_File_convert_Lab_Test_data.py b/app/processors/V2-Every_Pat_File_convert_Lab_Test_data.py index 6d57844..8fab01e 100644 --- a/app/processors/V2-Every_Pat_File_convert_Lab_Test_data.py +++ b/app/processors/V2-Every_Pat_File_convert_Lab_Test_data.py @@ -236,7 +236,7 @@ for pat_no in os.listdir(file_dir): # 遍历 file_dir 下的所有文件和文 with open(row_file_path, "r", encoding="utf-8-sig") as row_file: detail_rows = list(csv.DictReader(row_file)) - sheet_results, unassigned_items = route_detail_rows(detail_rows, ALL_tests) + sheet_results, unassigned_items = route_detail_rows(detail_rows, ALL_tests, row_1.get("req_reason", "")) append_routed_report( add_content_to_excel, result_save_pth, diff --git a/app/processors/dynamic_router.py b/app/processors/dynamic_router.py index 76cbe0f..f54a9af 100644 --- a/app/processors/dynamic_router.py +++ b/app/processors/dynamic_router.py @@ -4,6 +4,24 @@ import re UNMATCHED_HEADER = "未匹配检测内容" UNASSIGNED_SHEET_NAME = "未归属检测内容" +CATEGORY_REASON_KEYWORDS = { + "血细胞": ["血细胞", "血常规"], + "凝血": ["凝血"], + "肝功": ["肝功", "肾功", "电解质", "葡萄糖", "心肌酶"], + "各类肿瘤标志物": ["肿瘤", "标志物", "癌胚", "甲状旁腺", "降钙素", "鳞状细胞"], + "七抗": ["七抗", "自身抗体"], + "传染指标": ["传染"], + "血气分析+生化分析": ["血气"], + "感染指标": ["感染", "新冠", "冠状病毒", "结核", "细菌", "病毒", "HPV", "C反应蛋白", "降钙素原"], + "基因检测指标": ["基因", "CYP"], + "心衰系列": ["心衰", "B型", "BNP", "Pro-BNP", "钠尿肽", "肌钙蛋白"], + "普通指标": ["血型", "隐血", "卡式"], + "免疫系列": ["甲功", "甲状腺", "促甲状腺", "抗甲状腺"], + "特殊指标": ["细胞因子", "白介素", "血管内皮"], + "内分泌代谢系列": ["内分泌", "代谢", "儿茶酚胺", "ANCA"], + "用药指导": ["用药", "VKORC", "CYP2C9", "ALDH2", "ApoE", "SLCO"], +} + def match_re(value, pattern): return re.match(str(pattern), str(value or "")) is not None @@ -23,37 +41,75 @@ def detail_value(row): return "" -def route_detail_rows(detail_rows, all_tests): +def route_detail_rows(detail_rows, all_tests, reason=""): sheet_results = {} unassigned_items = [] + tests_by_name = {test["test_check_name"]: test for test in all_tests} + candidates_by_row = [] + candidate_counts = {} for detail_row in detail_rows: + candidates = _match_candidates(detail_row, all_tests) + candidates_by_row.append((detail_row, candidates)) + for candidate in candidates: + candidate_counts[candidate["sheet_name"]] = candidate_counts.get(candidate["sheet_name"], 0) + 1 + + for detail_row, candidates in candidates_by_row: item_name = detail_row.get("rpt_itemname", "") - matched_any = False - - for test in all_tests: - test_result_col_name = test["test_result_col_name"] - test_check_list = test["test_check_list"] - test_check_list_all = test["test_check_list_all"] - - for index, checks in enumerate(test_check_list_all): - if isinstance(checks, str): - checks = [checks] - if any(match_re(item_name, pattern) for pattern in checks): - sheet_name = test["test_check_name"] - result_name = test_check_list[index] - sheet_results.setdefault(sheet_name, {})[result_name] = clean_result( - detail_row.get(test_result_col_name, "") - ) - matched_any = True - break - - if not matched_any: + if not candidates: unassigned_items.append(f"{item_name}:{detail_value(detail_row)}") + continue + + candidate = _choose_candidate(candidates, reason, candidate_counts) + test = tests_by_name[candidate["sheet_name"]] + sheet_results.setdefault(candidate["sheet_name"], {})[candidate["result_name"]] = clean_result( + detail_row.get(test["test_result_col_name"], "") + ) return sheet_results, unassigned_items +def _match_candidates(detail_row, all_tests): + item_name = detail_row.get("rpt_itemname", "") + candidates = [] + + for test_index, test in enumerate(all_tests): + test_check_list = test["test_check_list"] + test_check_list_all = test["test_check_list_all"] + + for item_index, checks in enumerate(test_check_list_all): + if isinstance(checks, str): + checks = [checks] + if any(match_re(item_name, pattern) for pattern in checks): + candidates.append( + { + "sheet_name": test["test_check_name"], + "result_name": test_check_list[item_index], + "test_index": test_index, + "item_index": item_index, + } + ) + break + + return candidates + + +def _choose_candidate(candidates, reason, candidate_counts): + if len(candidates) == 1: + return candidates[0] + + reason = str(reason or "") + + def score(candidate): + sheet_name = candidate["sheet_name"] + keywords = CATEGORY_REASON_KEYWORDS.get(sheet_name, []) + reason_score = 100 if any(keyword and keyword in reason for keyword in keywords) else 0 + density_score = candidate_counts.get(sheet_name, 0) + return (reason_score, density_score, -candidate["test_index"], -candidate["item_index"]) + + return max(candidates, key=score) + + def append_routed_report( add_content_to_excel, result_save_path, diff --git a/tests/verify_dynamic_routing.py b/tests/verify_dynamic_routing.py index 8b37bad..afb3720 100644 --- a/tests/verify_dynamic_routing.py +++ b/tests/verify_dynamic_routing.py @@ -79,6 +79,22 @@ def build_fixture(root): "req_reason": "未知组合检测", "sampled_dt": "2026-01-01 08:10:00", }, + { + "rptunitid": "161", + "rechkdt": "2026-01-01 10:15:00", + "reportid": "20260101-161-4", + "reporttype": "10", + "req_reason": "甲功七项(化学发光法)[复]", + "sampled_dt": "2026-01-01 08:15:00", + }, + { + "rptunitid": "161", + "rechkdt": "2026-01-01 10:15:00", + "reportid": "20260101-161-4", + "reporttype": "10", + "req_reason": "甲功七项(化学发光法)[复]", + "sampled_dt": "2026-01-01 08:15:00", + }, ] for row in reports: for header in REPORT_HEADERS: @@ -90,6 +106,12 @@ def build_fixture(root): [{"reportid": reports[0]["reportid"], "rpt_itemname": "谷草转氨酶", "result_str": "21"}], [{"reportid": reports[1]["reportid"], "rpt_itemname": "B型前脑尿钠肽", "result_str": "57.20"}], [{"reportid": reports[2]["reportid"], "rpt_itemname": "神秘检测项目", "result_str": "42"}], + [ + {"reportid": reports[3]["reportid"], "rpt_itemname": "甲状腺球蛋白", "result_str": "6.02"}, + ], + [ + {"reportid": reports[4]["reportid"], "rpt_itemname": "甲状腺球蛋白", "result_str": "6.02"}, + ], ] detail_dir = root / "Tests_Detail_List" / patient_id for report, rows in zip(reports, details): @@ -114,10 +136,13 @@ def assert_dynamic_routing(workbook_path): try: liver_rows = rows_for(workbook["肝功"]) heart_rows = rows_for(workbook["心衰系列"]) + immune_rows = rows_for(workbook["免疫系列"]) summary_text = "\n".join("\t".join(row) for row in rows_for(workbook["未检测到内容汇总"])) liver_data = [row for row in liver_rows[1:] if row and row[0]] heart_data = [row for row in heart_rows[1:] if row and row[0]] + tumor_data = [row for row in rows_for(workbook["各类肿瘤标志物"])[1:] if row and row[0]] + immune_data = [row for row in immune_rows[1:] if row and row[0]] assert len(liver_data) == 1, liver_data assert liver_data[0][3] == "肝功十项[复]_电解质五项[复]_肾功三项[复]" @@ -132,6 +157,9 @@ def assert_dynamic_routing(workbook_path): assert "神秘检测项目" in summary_text assert "谷草转氨酶" not in summary_text assert "B型前脑尿钠肽" not in summary_text + assert len(tumor_data) == 0, tumor_data + assert len(immune_data) == 1, immune_data + assert immune_data[0][3] == "甲功七项(化学发光法)[复]" finally: workbook.close()