找出上传文件中,姓名一列有重复的序号,并予以标注
Created on: January 22, 2026
Answered using GPT-5.2 Thinking by Chat01
Created on: January 22, 2026
Answered using GPT-5.2 Thinking by Chat01
pythonimport openpyxl, os, pandas as pd, re, math, json, statistics path="/mnt/data/2025年度企业参与研发项目科技人员汇总表.xlsx" wb=openpyxl.load_workbook(path) wb.sheetnames ['Sheet1']
pythonws=wb["Sheet1"] # find max row/col with data max_row=ws.max_row max_col=ws.max_column max_row, max_col (253, 15)
pythondef row_values(r, cmax=15): return [ws.cell(r,c).value for c in range(1,cmax+1)] for r in range(1,11): print(r, row_values(r, 15))
pythonimport re, collections name_col=4 serial_col=1 duplicates=collections.defaultdict(list) for r in range(3, max_row+1): name=ws.cell(r,name_col).value serial=ws.cell(r,serial_col).value if name is None or str(name).strip()=="": continue key=re.sub(r'[\s\u3000]+', '', str(name).strip()) duplicates[key].append((r, serial, name)) dup_items={k:v for k,v in duplicates.items() if len(v)>1} len(dup_items), list(dup_items.items())[:3] (63, [('路严', [(3, 1, '路 严'), (152, 150, '路严')]), ('崔恒庆', [(4, 2, '崔恒庆'), (161, 159, '崔恒庆')]), ('杨兴坤', [(5, 3, '杨兴坤'), (155, 153, '杨兴坤')])])
pythonheader=row_values(2, max_col) list(enumerate(header, start=1)) [(1, '序号'), (2, '单位'), (3, '部门、车间'), (4, '姓名'), (5, '性别'), (6, '身份证号'), (7, '职务(工种)'), (8, '参加工作时间'), (9, '毕业院校'), (10, '专业'), (11, '学历\n(文化程度)'), (12, '职称\n(技术等级)'), (13, '人员分类\n(干/工)'), (14, '参与研发项目'), (15, None)]
pythonfrom openpyxl.utils import get_column_letter dim=ws.column_dimensions[get_column_letter(14)].width dim15=ws.column_dimensions[get_column_letter(15)].width dim, dim15 (12.0, 13.0)
pythonfrom copy import copy h14=ws.cell(2,14) h15=ws.cell(2,15) h14.fill.patternType, h14.font.bold, h14.alignment.wrap_text, h14.border.left.style, h14.border.right.style, h15.value (None, True, True, 'thin', 'thin', None)
pythond14=ws.cell(3,14) d15=ws.cell(3,15) (d14.border.left.style, d14.alignment.horizontal, d14.alignment.wrap_text, d15.border.left.style, d15.value) ('thin', 'center', True, None, None)
pythonws.merged_cells.ranges {<MergedCellRange A1:M1>}
pythonimport openpyxl, re, collections, os from openpyxl.styles import PatternFill, Font from openpyxl.utils import get_column_letter from copy import copy import pandas as pd in_path = "/mnt/data/2025年度企业参与研发项目科技人员汇总表.xlsx" wb = openpyxl.load_workbook(in_path) ws = wb.active # --- Find header row and columns --- header_row = None name_col = None serial_col = None for r in range(1, min(ws.max_row, 30) + 1): row = [ws.cell(r, c).value for c in range(1, ws.max_column + 1)] if "姓名" in row and "序号" in row: header_row = r name_col = row.index("姓名") + 1 serial_col = row.index("序号") + 1 break if header_row is None: raise ValueError("未找到包含“姓名”和“序号”的表头行。") data_start = header_row + 1 # Pick a marking column (use first completely blank header cell if available, else append) mark_col = None for c in range(1, ws.max_column + 1): if ws.cell(header_row, c).value in (None, ""): mark_col = c break if mark_col is None: mark_col = ws.max_column + 1 # --- Build duplicate map (normalize by removing all whitespace, incl. fullwidth) --- dups = collections.defaultdict(list) for r in range(data_start, ws.max_row + 1): name = ws.cell(r, name_col).value serial = ws.cell(r, serial_col).value if name is None or str(name).strip() == "": continue key = re.sub(r'[\s\u3000]+', '', str(name).strip()) dups[key].append((r, serial, name)) dup_items = {k: v for k, v in dups.items() if len(v) > 1} # --- Styles --- # highlight: light red fill similar to Excel "Light Red Fill with Dark Red Text" dup_fill = PatternFill(fill_type="solid", fgColor="FFC7CE") dup_font = Font(color="9C0006") # dark red # Mark column header ws.cell(header_row, mark_col).value = "姓名重复" # Copy header style from previous visible header cell (prefer the one right before mark_col if exists) src_header_cell = ws.cell(header_row, max(1, mark_col - 1)) tgt_header_cell = ws.cell(header_row, mark_col) tgt_header_cell.font = copy(src_header_cell.font) tgt_header_cell.fill = copy(src_header_cell.fill) tgt_header_cell.border = copy(src_header_cell.border) tgt_header_cell.alignment = copy(src_header_cell.alignment) tgt_header_cell.number_format = src_header_cell.number_format tgt_header_cell.protection = copy(src_header_cell.protection) # Ensure mark column cells match data row style (copy from previous column in same row) for r in range(data_start, ws.max_row + 1): # only style rows that appear to have data in serial or name if ws.cell(r, serial_col).value is None and ws.cell(r, name_col).value is None: continue src = ws.cell(r, max(1, mark_col - 1)) tgt = ws.cell(r, mark_col) tgt.font = copy(src.font) tgt.fill = copy(src.fill) tgt.border = copy(src.border) tgt.alignment = copy(src.alignment) tgt.number_format = src.number_format tgt.protection = copy(src.protection) # Mark duplicates and highlight for key, rows in dup_items.items(): for (r, serial, name) in rows: ws.cell(r, mark_col).value = "重复" # highlight serial and name for c in (serial_col, name_col): cell = ws.cell(r, c) cell.fill = dup_fill # Keep existing font bold/size but make text dark red for visibility new_font = copy(cell.font) new_font.color = dup_font.color cell.font = new_font # --- Add a summary sheet --- summary_name = "重复姓名汇总" if summary_name in wb.sheetnames: del wb[summary_name] sum_ws = wb.create_sheet(summary_name) sum_ws.append(["姓名(去空格)", "出现次数", "序号列表", "行号列表", "姓名原样(示例)"]) for key in sorted(dup_items.keys(), key=lambda x: x): rows = dup_items[key] serials = [str(s) for (_, s, _) in rows if s is not None] rownums = [str(r) for (r, _, _) in rows] samples = sorted({str(n) for (_, _, n) in rows if n is not None}) sum_ws.append([key, len(rows), "、".join(serials), "、".join(rownums), ";".join(samples)]) # Format summary header hdr_fill = PatternFill(fill_type="solid", fgColor="D9E1F2") hdr_font = Font(bold=True) for c in range(1, 6): cell = sum_ws.cell(1, c) cell.fill = hdr_fill cell.font = hdr_font sum_ws.freeze_panes = "A2" # Set column widths for readability widths = [18, 10, 30, 22, 30] for i, w in enumerate(widths, start=1): sum_ws.column_dimensions[get_column_letter(i)].width = w # Save output out_path = "/mnt/data/2025年度企业参与研发项目科技人员汇总表_姓名重复已标注.xlsx" wb.save(out_path) # Build a DataFrame for quick display in chat UI (one row per duplicated name) rows_for_df = [] for key, rows in sorted(dup_items.items(), key=lambda kv: kv[0]): rows_for_df.append({ "姓名(去空格)": key, "出现次数": len(rows), "序号列表": "、".join([str(s) for (_, s, _) in rows if s is not None]), }) df = pd.DataFrame(rows_for_df) from caas_jupyter_tools import display_dataframe_to_user display_dataframe_to_user("姓名重复汇总(按“姓名”去空格后识别)", df) out_path '/mnt/data/2025年度企业参与研发项目科技人员汇总表_姓名重复已标注.xlsx'
pythonfrom artifact_tool import SpreadsheetArtifact out_path="/mnt/data/2025年度企业参与研发项目科技人员汇总表_姓名重复已标注.xlsx" artifact = SpreadsheetArtifact.load(out_path) artifact.recalculate() artifact.save(out_path) # render first sheet and summary for quick visual inspection rendered = artifact.render(sheet_name="Sheet1") rendered2 = artifact.render(sheet_name="重复姓名汇总") rendered, rendered2[:2] if isinstance(rendered2, list) else type(rendered2)
pythonfrom artifact_tool import SpreadsheetArtifact out_path="/mnt/data/2025年度企业参与研发项目科技人员汇总表_姓名重复已标注.xlsx" artifact=SpreadsheetArtifact.load(out_path) artifact.calculate() exported=artifact.export(out_path, overwrite=True) exported PosixPath('/mnt/data/2025年度企业参与研发项目科技人员汇总表_姓名重复已标注.xlsx')
pythonsheet=artifact.sheet("Sheet1") rng=sheet.minimum_range() img_path=rng.render() img_path
pythonsheet=artifact.sheet("Sheet1") img_path=sheet.range("A1:O40").render() img_path PosixPath('/tmp/granola_render_spreadsheet6959gelp/Sheet1__range-A1-O40.png')
pythonimg2=artifact.sheet("重复姓名汇总").range("A1:E25").render() str(img2) '/tmp/granola_render_spreadsheet4a6essx4/___range-A1-E25.png'
pythonimg2=artifact.sheet("重复姓名汇总").range("A1:E25").render() str(img2) '/tmp/granola_render_spreadsheet4a6essx4/___range-A1-E25.png' <<ImageDisplayed>>
pythondup_name_count=len(dup_items) dup_row_count=sum(len(v) for v in dup_items.values()) dup_name_count, dup_row_count (63, 159)
已在你上传的表里把**“姓名”列重复对应的“序号”**找出来并完成标注了(按“姓名去掉空格/全角空格后”识别重复)。
共识别到 63 个重复姓名,涉及 159 行被标注。