"""Scan docx / pptx / pdf / Excel / txt files for sensitive keywords."""
import re

# The parser back-ends are optional: a missing one must not prevent scanning
# the formats whose parser *is* installed.  The matching ``*_find`` function
# raises ImportError when its back-end is unavailable.
try:
    import docx2txt
except ImportError:
    docx2txt = None
try:
    from pptx import Presentation
except ImportError:
    Presentation = None
try:
    import PyPDF2
except ImportError:
    PyPDF2 = None
try:
    import pandas as pd
except ImportError:
    pd = None

_WHITESPACE_RE = re.compile(r'\s+')

# NOTE(review): the markup literals in the previous revision were empty
# strings, so highlighting was a no-op.  ``<mark>`` is assumed here; confirm
# which tag the consuming UI expects.
_HIGHLIGHT_OPEN = '<mark>'
_HIGHLIGHT_CLOSE = '</mark>'

# Encodings tried in order when reading plain-text files.  ``None`` means the
# platform default, which is the only encoding that used to be attempted.
_TEXT_ENCODINGS = ('utf-8-sig', None, 'gb18030')


def insert_html_tag(paragraph: str, keyword_list: list) -> str:
    """
    Wrap every occurrence of a sensitive keyword in an HTML highlight tag.

    All keywords are matched in a single pass, longest first, so that a
    keyword is never pre-empted by one of its own prefixes and text that was
    already wrapped (including the tag itself) is never matched again.

    NOTE(review): ``paragraph`` is not HTML-escaped; if the result is rendered
    as HTML and the scanned documents are untrusted, escape it upstream.

    :param paragraph: text to mark up
    :param keyword_list: sensitive keywords; empty strings are ignored
    :return: the paragraph with every keyword occurrence wrapped
    """
    keywords = sorted({kw for kw in keyword_list if kw}, key=len, reverse=True)
    if not keywords:
        return paragraph
    pattern = re.compile('|'.join(re.escape(kw) for kw in keywords))
    return pattern.sub(
        lambda match: f'{_HIGHLIGHT_OPEN}{match.group(0)}{_HIGHLIGHT_CLOSE}',
        paragraph,
    )


def _collect_hits(paragraphs, keyword_list: list) -> tuple:
    """Strip whitespace from each paragraph and keep those containing a keyword.

    Returns ``(find_list, paragraph_keyword)``: two parallel lists holding the
    whitespace-stripped paragraph and the keywords found in it (in
    ``keyword_list`` order).
    """
    # An empty keyword is "in" every string and would flag every paragraph.
    keywords = [kw for kw in keyword_list if kw]
    find_list = []
    paragraph_keyword = []
    for raw_para in paragraphs:
        para = _WHITESPACE_RE.sub('', raw_para)
        hits = [kw for kw in keywords if kw in para]
        if hits:
            paragraph_keyword.append(hits)
            find_list.append(para)
    return find_list, paragraph_keyword


def _build_result(file_path: str, find_list: list, paragraph_keyword: list) -> dict:
    """Assemble the result dictionary shared by every ``*_find`` function."""
    return {
        'file_name': file_path,
        'find_list': find_list,
        'paragraph_keyword': paragraph_keyword,
    }


def docx_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a Word document for sensitive keywords.

    The text is extracted with ``docx2txt.process`` and searched line by line
    after all whitespace has been removed from each line.

    :param file_path: path of the document
    :param keyword_list: sensitive keywords
    :return: ``{'file_name': ..., 'find_list': [paragraph, ...],
             'paragraph_keyword': [[keyword, ...], ...]}``
    :raises ImportError: if docx2txt is not installed
    """
    if docx2txt is None:
        raise ImportError('docx2txt is required to scan Word documents')
    doc_text = docx2txt.process(file_path)
    find_list, paragraph_keyword = _collect_hits(doc_text.split('\n'), keyword_list)
    return _build_result(file_path, find_list, paragraph_keyword)


def _iter_pptx_paragraphs(presentation):
    """Yield the text of every paragraph in every text frame of every slide."""
    for slide in presentation.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                # Join the runs: a keyword whose characters carry different
                # formatting is split across runs and would otherwise be missed.
                yield ''.join(run.text for run in paragraph.runs)


def pptx_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a PowerPoint presentation for sensitive keywords.

    Scope: the paragraphs of every top-level shape that has a text frame, on
    every slide.

    :param file_path: path of the presentation
    :param keyword_list: sensitive keywords
    :return: same structure as :func:`docx_find`
    :raises ImportError: if python-pptx is not installed
    """
    if Presentation is None:
        raise ImportError('python-pptx is required to scan PowerPoint files')
    presentation = Presentation(file_path)
    find_list, paragraph_keyword = _collect_hits(
        _iter_pptx_paragraphs(presentation), keyword_list
    )
    return _build_result(file_path, find_list, paragraph_keyword)


def pdf_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a PDF file for sensitive keywords.

    Each page is searched as one whitespace-stripped paragraph, so a keyword
    that is split by a page break will not be found.

    :param file_path: path of the PDF
    :param keyword_list: sensitive keywords
    :return: same structure as :func:`docx_find`
    :raises ImportError: if PyPDF2 is not installed
    """
    if PyPDF2 is None:
        raise ImportError('PyPDF2 is required to scan PDF files')
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # ``or ''`` is a defensive guard in case a page yields no text object.
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]
    find_list, paragraph_keyword = _collect_hits(page_texts, keyword_list)
    return _build_result(file_path, find_list, paragraph_keyword)


def excel_find(file_path: str, keyword_list: list) -> dict:
    """
    Search an Excel workbook for sensitive keywords.

    Scope: every string cell of every worksheet.  Cell text is searched as-is
    (whitespace is not stripped).  Each hit is reported as
    ``"<sheet name>, <1-based row>, <0-based column>, <cell text>"``.

    :param file_path: path of the workbook
    :param keyword_list: sensitive keywords
    :return: same structure as :func:`docx_find`
    :raises ImportError: if pandas is not installed
    """
    if pd is None:
        raise ImportError('pandas is required to scan Excel files')
    keywords = [kw for kw in keyword_list if kw]
    find_list = []
    paragraph_keyword = []
    # Open the workbook once and make sure the handle is released afterwards.
    with pd.ExcelFile(file_path) as workbook:
        for sheet_name in workbook.sheet_names:
            frame = workbook.parse(sheet_name=sheet_name, header=None)
            for index, row in frame.iterrows():
                for col_name, cell_value in row.items():
                    if not isinstance(cell_value, str):
                        continue
                    hits = [kw for kw in keywords if kw in cell_value]
                    if hits:
                        paragraph_keyword.append(hits)
                        find_list.append(
                            f"{sheet_name}, {index + 1}, {col_name}, {cell_value}"
                        )
    return _build_result(file_path, find_list, paragraph_keyword)


def _read_text(file_path: str) -> str:
    """Read a text file, trying each encoding in ``_TEXT_ENCODINGS`` in turn."""
    for encoding in _TEXT_ENCODINGS:
        try:
            with open(file_path, 'r', encoding=encoding) as txt_file:
                return txt_file.read()
        except UnicodeDecodeError:
            continue
    # Last resort: never fail on undecodable bytes, just replace them.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as txt_file:
        return txt_file.read()


def txt_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a plain-text file for sensitive keywords.

    Scope: every line, after all whitespace has been removed from it.  The
    file is decoded as UTF-8 (BOM tolerated), then with the platform default
    encoding, then as GB18030; undecodable bytes are replaced as a last resort.

    :param file_path: path of the text file
    :param keyword_list: sensitive keywords
    :return: same structure as :func:`docx_find`
    """
    text_content = _read_text(file_path)
    find_list, paragraph_keyword = _collect_hits(text_content.split('\n'), keyword_list)
    return _build_result(file_path, find_list, paragraph_keyword)


# Extension -> finder dispatch table used by util_keyword_find.
# NOTE(review): the legacy binary extensions (.doc, .dot, .ppt, .pot) are kept
# from the original dispatch; the OOXML parsers presumably cannot open them —
# verify, and convert or drop them if so.
_FINDERS = (
    (('.docx', '.doc', '.dot'), docx_find),
    (('.pptx', '.ppt', '.pot'), pptx_find),
    (('.xlsx', '.xls', '.xlsm', '.xlsb', '.xltm', '.xltx'), excel_find),
    (('.pdf',), pdf_find),
    (('.txt',), txt_find),
)


def util_keyword_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a single file for sensitive keywords and highlight the hits.

    The finder is chosen from the (case-insensitive) file extension; every
    entry of ``find_list`` is then passed through :func:`insert_html_tag`.

    :param file_path: path of the file to scan
    :param keyword_list: sensitive keywords
    :return: ``{'file_name': ..., 'find_list': [marked paragraph, ...],
             'paragraph_keyword': [[keyword, ...], ...]}``; both lists are
             empty for an unsupported extension
    """
    lowered_path = file_path.lower()
    for extensions, finder in _FINDERS:
        if lowered_path.endswith(extensions):
            find_dict = finder(file_path, keyword_list)
            find_dict['find_list'] = [
                insert_html_tag(para, keyword_list)
                for para in find_dict['find_list']
            ]
            return find_dict
    return _build_result(file_path, [], [])


if __name__ == '__main__':
    # Alternative example:
    # print(util_keyword_find('浅水海底电缆打捞大作业.docx', ['打捞', '浅水']))
    print(util_keyword_find('浅水.xlsx', ['打捞', '声呐', '浅水']))