From 4909705a2d7f44d0adfce65f138d76f89e23dee7 Mon Sep 17 00:00:00 2001
From: admin <572701190@qq.com>
Date: Tue, 19 May 2026 17:41:24 +0800
Subject: [PATCH] Tighten lab item routing and dedupe rows

---
 app/processor.py                              | 20 ++++
 .../V1-ALL_convert_Lab_Test_data.py           |  3 +-
 ...V2-Every_Pat_File_convert_Lab_Test_data.py |  2 +-
 app/processors/dynamic_router.py              | 98 +++++++++++++++----
 tests/verify_dynamic_routing.py               | 28 ++++++
 5 files changed, 127 insertions(+), 24 deletions(-)

diff --git a/app/processor.py b/app/processor.py
index aa59f13..6eae18e 100644
--- a/app/processor.py
+++ b/app/processor.py
@@ -464,6 +464,7 @@ def _postprocess_workbook(
     workbook = load_workbook(path)
     try:
         _normalize_unmatched_columns(workbook)
+        _deduplicate_regular_rows(workbook)
         summary_records = _collect_summary_records(workbook)
         _remove_not_found_rows(workbook)
         _remove_empty_unmatched_columns(workbook)
@@ -555,6 +556,39 @@ def _remove_not_found_rows(workbook) -> None:
                 sheet.delete_rows(row_index, 1)
 
 
+def _deduplicate_regular_rows(workbook) -> None:
+    """Remove repeated data rows from every sheet except the summary sheet.
+
+    Row 1 is never examined (presumably the header row). Two rows count as
+    duplicates when the ``_cell_text`` form of every cell is equal. The first
+    occurrence is kept, so surviving rows stay in their original order; rows
+    whose ``_cell_text`` values are all falsy are neither recorded nor removed.
+    """
+    for sheet in workbook.worksheets:
+        if sheet.title == SUMMARY_SHEET_NAME or sheet.max_row < 3:
+            # Fewer than two rows below row 1: nothing can repeat.
+            continue
+
+        column_count = sheet.max_column
+        seen = set()
+        duplicate_rows = []
+        for row_index in range(2, sheet.max_row + 1):
+            values = tuple(
+                _cell_text(sheet.cell(row_index, col).value)
+                for col in range(1, column_count + 1)
+            )
+            if not any(values):
+                continue
+            if values in seen:
+                duplicate_rows.append(row_index)
+            else:
+                seen.add(values)
+
+        # Delete bottom-up so a removal never shifts an index still pending.
+        for row_index in reversed(duplicate_rows):
+            sheet.delete_rows(row_index, 1)
+
+
 def _collect_summary_records(workbook) -> list[dict[str, object]]:
     records: list[dict[str, object]] = []
     for sheet in workbook.worksheets:
diff --git a/app/processors/V1-ALL_convert_Lab_Test_data.py b/app/processors/V1-ALL_convert_Lab_Test_data.py
index 4506918..f0159a1 100644
--- a/app/processors/V1-ALL_convert_Lab_Test_data.py
+++ b/app/processors/V1-ALL_convert_Lab_Test_data.py
@@ -271,7 +271,7 @@ for pat_no in pat_no_col:
         with open(row_file_path, "r", encoding='utf-8-sig') as row_file:
             detail_rows = list(csv.DictReader(row_file))
 
-        sheet_results, unassigned_items = route_detail_rows(detail_rows, ALL_tests)
+        sheet_results, unassigned_items = route_detail_rows(detail_rows, ALL_tests, row_1.get("req_reason", ""))
         append_routed_report(
             add_content_to_excel,
             result_save_pth,
@@ -286,4 +286,3 @@ for pat_no in pat_no_col:
     # 每处理一个患者数据，保存相关信息
 save_excel()                
      
-
diff --git a/app/processors/V2-Every_Pat_File_convert_Lab_Test_data.py b/app/processors/V2-Every_Pat_File_convert_Lab_Test_data.py
index 6d57844..8fab01e 100644
--- a/app/processors/V2-Every_Pat_File_convert_Lab_Test_data.py
+++ b/app/processors/V2-Every_Pat_File_convert_Lab_Test_data.py
@@ -236,7 +236,7 @@ for pat_no in os.listdir(file_dir): # 遍历 file_dir 下的所有文件和文
         with open(row_file_path, "r", encoding="utf-8-sig") as row_file:
             detail_rows = list(csv.DictReader(row_file))
 
-        sheet_results, unassigned_items = route_detail_rows(detail_rows, ALL_tests)
+        sheet_results, unassigned_items = route_detail_rows(detail_rows, ALL_tests, row_1.get("req_reason", ""))
         append_routed_report(
             add_content_to_excel,
             result_save_pth,
diff --git a/app/processors/dynamic_router.py b/app/processors/dynamic_router.py
index 76cbe0f..f54a9af 100644
--- a/app/processors/dynamic_router.py
+++ b/app/processors/dynamic_router.py
@@ -4,6 +4,24 @@ import re
 UNMATCHED_HEADER = "未匹配检测内容"
 UNASSIGNED_SHEET_NAME = "未归属检测内容"
 
+CATEGORY_REASON_KEYWORDS = {
+    "血细胞": ["血细胞", "血常规"],
+    "凝血": ["凝血"],
+    "肝功": ["肝功", "肾功", "电解质", "葡萄糖", "心肌酶"],
+    "各类肿瘤标志物": ["肿瘤", "标志物", "癌胚", "甲状旁腺", "降钙素", "鳞状细胞"],
+    "七抗": ["七抗", "自身抗体"],
+    "传染指标": ["传染"],
+    "血气分析+生化分析": ["血气"],
+    "感染指标": ["感染", "新冠", "冠状病毒", "结核", "细菌", "病毒", "HPV", "C反应蛋白", "降钙素原"],
+    "基因检测指标": ["基因", "CYP"],
+    "心衰系列": ["心衰", "B型", "BNP", "Pro-BNP", "钠尿肽", "肌钙蛋白"],
+    "普通指标": ["血型", "隐血", "卡式"],
+    "免疫系列": ["甲功", "甲状腺", "促甲状腺", "抗甲状腺"],
+    "特殊指标": ["细胞因子", "白介素", "血管内皮"],
+    "内分泌代谢系列": ["内分泌", "代谢", "儿茶酚胺", "ANCA"],
+    "用药指导": ["用药", "VKORC", "CYP2C9", "ALDH2", "ApoE", "SLCO"],
+}
+
 
 def match_re(value, pattern):
     return re.match(str(pattern), str(value or "")) is not None
@@ -23,37 +41,111 @@ def detail_value(row):
     return ""
 
 
-def route_detail_rows(detail_rows, all_tests):
+def route_detail_rows(detail_rows, all_tests, reason=""):
+    """Route every detail row to at most one result sheet.
+
+    Args:
+        detail_rows: Iterable of dict rows; ``rpt_itemname`` is the name
+            matched against the test patterns.
+        all_tests: Iterable of test definitions providing the keys
+            ``test_check_name``, ``test_check_list``,
+            ``test_check_list_all`` and ``test_result_col_name``.
+        reason: Request reason of the report; only used to break ties when
+            a row matches more than one sheet.
+
+    Returns:
+        ``(sheet_results, unassigned_items)``: a mapping of sheet name to
+        ``{result name: cleaned value}``, and one text entry per row that
+        matched no test.
+    """
     sheet_results = {}
     unassigned_items = []
+    # Materialise once: candidates refer back to their test by position.
+    tests = list(all_tests)
+    candidates_by_row = []
+    candidate_counts = {}
 
     for detail_row in detail_rows:
+        candidates = _match_candidates(detail_row, tests)
+        candidates_by_row.append((detail_row, candidates))
+        # Count matching rows per sheet; used later to break ties.
+        for candidate in candidates:
+            sheet_name = candidate["sheet_name"]
+            candidate_counts[sheet_name] = candidate_counts.get(sheet_name, 0) + 1
+
+    for detail_row, candidates in candidates_by_row:
         item_name = detail_row.get("rpt_itemname", "")
-        matched_any = False
-
-        for test in all_tests:
-            test_result_col_name = test["test_result_col_name"]
-            test_check_list = test["test_check_list"]
-            test_check_list_all = test["test_check_list_all"]
-
-            for index, checks in enumerate(test_check_list_all):
-                if isinstance(checks, str):
-                    checks = [checks]
-                if any(match_re(item_name, pattern) for pattern in checks):
-                    sheet_name = test["test_check_name"]
-                    result_name = test_check_list[index]
-                    sheet_results.setdefault(sheet_name, {})[result_name] = clean_result(
-                        detail_row.get(test_result_col_name, "")
-                    )
-                    matched_any = True
-                    break
-
-        if not matched_any:
+        if not candidates:
             unassigned_items.append(f"{item_name}：{detail_value(detail_row)}")
+            continue
+
+        candidate = _choose_candidate(candidates, reason, candidate_counts)
+        # Resolve the test by position so two definitions that share a
+        # ``test_check_name`` cannot be confused with each other.
+        test = tests[candidate["test_index"]]
+        sheet_results.setdefault(candidate["sheet_name"], {})[candidate["result_name"]] = clean_result(
+            detail_row.get(test["test_result_col_name"], "")
+        )
 
     return sheet_results, unassigned_items
 
 
+def _match_candidates(detail_row, all_tests):
+    """Return one candidate per test whose patterns match the row's item name.
+
+    Each candidate records the target ``sheet_name``, the ``result_name``
+    of the first matching pattern group in that test, and the positions
+    ``test_index`` / ``item_index`` of the test and group.
+    """
+    item_name = detail_row.get("rpt_itemname", "")
+    candidates = []
+
+    for test_index, test in enumerate(all_tests):
+        test_check_list = test["test_check_list"]
+        test_check_list_all = test["test_check_list_all"]
+
+        for item_index, checks in enumerate(test_check_list_all):
+            # A group may be a single pattern or a list of alternatives.
+            if isinstance(checks, str):
+                checks = [checks]
+            if any(match_re(item_name, pattern) for pattern in checks):
+                candidates.append(
+                    {
+                        "sheet_name": test["test_check_name"],
+                        "result_name": test_check_list[item_index],
+                        "test_index": test_index,
+                        "item_index": item_index,
+                    }
+                )
+                # Only the first matching group of a test is used.
+                break
+
+    return candidates
+
+
+def _choose_candidate(candidates, reason, candidate_counts):
+    """Pick the one candidate a row matching several sheets is routed to.
+
+    Ranking, highest first: a sheet whose ``CATEGORY_REASON_KEYWORDS`` entry
+    has a keyword contained in ``reason``; the sheet matched by the most
+    rows (``candidate_counts``); the lowest ``test_index``; the lowest
+    ``item_index``.
+    """
+    if len(candidates) == 1:
+        return candidates[0]
+
+    reason = str(reason or "")
+
+    def score(candidate):
+        sheet_name = candidate["sheet_name"]
+        keywords = CATEGORY_REASON_KEYWORDS.get(sheet_name, [])
+        reason_score = 100 if any(keyword and keyword in reason for keyword in keywords) else 0
+        density_score = candidate_counts.get(sheet_name, 0)
+        return (reason_score, density_score, -candidate["test_index"], -candidate["item_index"])
+
+    return max(candidates, key=score)
+
+
 def append_routed_report(
     add_content_to_excel,
     result_save_path,
diff --git a/tests/verify_dynamic_routing.py b/tests/verify_dynamic_routing.py
index 8b37bad..afb3720 100644
--- a/tests/verify_dynamic_routing.py
+++ b/tests/verify_dynamic_routing.py
@@ -79,6 +79,22 @@ def build_fixture(root):
             "req_reason": "未知组合检测",
             "sampled_dt": "2026-01-01 08:10:00",
         },
+        {
+            "rptunitid": "161",
+            "rechkdt": "2026-01-01 10:15:00",
+            "reportid": "20260101-161-4",
+            "reporttype": "10",
+            "req_reason": "甲功七项（化学发光法）[复]",
+            "sampled_dt": "2026-01-01 08:15:00",
+        },
+        {
+            "rptunitid": "161",
+            "rechkdt": "2026-01-01 10:15:00",
+            "reportid": "20260101-161-4",
+            "reporttype": "10",
+            "req_reason": "甲功七项（化学发光法）[复]",
+            "sampled_dt": "2026-01-01 08:15:00",
+        },
     ]
     for row in reports:
         for header in REPORT_HEADERS:
@@ -90,6 +106,12 @@ def build_fixture(root):
         [{"reportid": reports[0]["reportid"], "rpt_itemname": "谷草转氨酶", "result_str": "21"}],
         [{"reportid": reports[1]["reportid"], "rpt_itemname": "B型前脑尿钠肽", "result_str": "57.20"}],
         [{"reportid": reports[2]["reportid"], "rpt_itemname": "神秘检测项目", "result_str": "42"}],
+        [
+            {"reportid": reports[3]["reportid"], "rpt_itemname": "甲状腺球蛋白", "result_str": "6.02"},
+        ],
+        [
+            {"reportid": reports[4]["reportid"], "rpt_itemname": "甲状腺球蛋白", "result_str": "6.02"},
+        ],
     ]
     detail_dir = root / "Tests_Detail_List" / patient_id
     for report, rows in zip(reports, details):
@@ -114,10 +136,13 @@ def assert_dynamic_routing(workbook_path):
     try:
         liver_rows = rows_for(workbook["肝功"])
         heart_rows = rows_for(workbook["心衰系列"])
+        immune_rows = rows_for(workbook["免疫系列"])
         summary_text = "\n".join("\t".join(row) for row in rows_for(workbook["未检测到内容汇总"]))
 
         liver_data = [row for row in liver_rows[1:] if row and row[0]]
         heart_data = [row for row in heart_rows[1:] if row and row[0]]
+        tumor_data = [row for row in rows_for(workbook["各类肿瘤标志物"])[1:] if row and row[0]]
+        immune_data = [row for row in immune_rows[1:] if row and row[0]]
 
         assert len(liver_data) == 1, liver_data
         assert liver_data[0][3] == "肝功十项[复]_电解质五项[复]_肾功三项[复]"
@@ -132,6 +157,9 @@ def assert_dynamic_routing(workbook_path):
         assert "神秘检测项目" in summary_text
         assert "谷草转氨酶" not in summary_text
         assert "B型前脑尿钠肽" not in summary_text
+        assert len(tumor_data) == 0, tumor_data
+        assert len(immune_data) == 1, immune_data
+        assert immune_data[0][3] == "甲功七项（化学发光法）[复]"
     finally:
         workbook.close()