"""Find sensitive keywords in .docx files and highlight them with HTML tags."""

import re

try:
    import docx2txt
except ImportError:
    # Only docx_find() needs docx2txt. Deferring the failure keeps the pure
    # helpers (and the non-.docx path) importable when it is not installed.
    docx2txt = None

# Matches any run of whitespace; compiled once because it is applied to
# every paragraph of every document.
_WHITESPACE_RE = re.compile(r'\s+')


def insert_html_tag(paragraph: str, keyword_list: list,
                    open_tag: str = '<mark>',
                    close_tag: str = '</mark>') -> str:
    """Wrap every occurrence of each keyword in ``paragraph`` with an HTML tag.

    All keywords are matched in a single pass, so text that was already
    wrapped (including the inserted tags themselves) is never re-scanned.
    When keywords overlap at the same position, the longest one wins.

    :param paragraph: the paragraph text to process
    :param keyword_list: list of keyword strings; empty strings are ignored
    :param open_tag: markup inserted before each match
    :param close_tag: markup inserted after each match
    :return: the paragraph with every match wrapped in open_tag/close_tag
    """
    # NOTE(review): the paragraph text itself is not HTML-escaped here; if
    # the result is rendered as HTML, confirm the caller escapes or trusts it.
    keywords = sorted({kw for kw in keyword_list if kw},
                      key=lambda kw: (-len(kw), kw))
    if not keywords:
        return paragraph
    pattern = re.compile('|'.join(re.escape(kw) for kw in keywords))
    # A callable replacement avoids backslash/group-reference interpretation
    # of the tag strings.
    return pattern.sub(lambda m: open_tag + m.group(0) + close_tag, paragraph)


def docx_find(file_path: str, keyword_list: list) -> dict:
    """Collect the paragraphs of a .docx file that contain any keyword.

    The text is read with ``docx2txt.process``, split on newlines, and all
    whitespace is removed from each paragraph before matching.

    :param file_path: path of the .docx file
    :param keyword_list: list of keyword strings; empty strings are ignored
    :return: a dict of the form
        ``{'file_name': file_path,
           'find_list': [paragraph, ...],
           'paragraph_keyword': [[keyword, ...], ...]}``
        where ``paragraph_keyword[i]`` lists the keywords found in
        ``find_list[i]``
    :raises ImportError: if docx2txt is not installed
    """
    if docx2txt is None:
        raise ImportError('docx2txt is required to read .docx files')

    # An empty keyword is "in" every string and would flag every paragraph.
    keywords = [kw for kw in keyword_list if kw]

    find_list = []
    paragraph_keyword = []
    for raw_para in docx2txt.process(file_path).split('\n'):
        para = _WHITESPACE_RE.sub('', raw_para)
        hits = [kw for kw in keywords if kw in para]
        if hits:
            find_list.append(para)
            paragraph_keyword.append(hits)

    return {
        'file_name': file_path,
        'find_list': find_list,
        'paragraph_keyword': paragraph_keyword,
    }


def util_keyword_find(file_path: str, keyword_list: list) -> dict:
    """Search a single file for keywords and highlight them in the results.

    Only ``.docx`` files (extension compared case-insensitively) are
    searched; any other path yields an empty result.

    :param file_path: path of the file to search
    :param keyword_list: list of keyword strings
    :return: a dict of the form
        ``{'file_name': file_path,
           'find_list': [tagged_paragraph, ...],
           'paragraph_keyword': [[keyword, ...], ...]}``
    """
    if not file_path.lower().endswith('.docx'):
        return {'file_name': file_path, 'find_list': [], 'paragraph_keyword': []}

    find_dict = docx_find(file_path, keyword_list)
    # Rebuild the list positionally instead of looking each paragraph up
    # with list.index(), which is quadratic and ambiguous for duplicates.
    find_dict['find_list'] = [
        insert_html_tag(para, hits)
        for para, hits in zip(find_dict['find_list'],
                              find_dict['paragraph_keyword'])
    ]
    return find_dict


if __name__ == '__main__':
    print(util_keyword_find('浅水海底电缆打捞大作业.docx', ['打捞', '浅水']))