import docx2txt
import re
def insert_html_tag(paragraph: str, keyword_list: list,
                    open_tag: str = '<mark>', close_tag: str = '</mark>') -> str:
    """
    Wrap every occurrence of a sensitive keyword in an HTML highlight tag.

    All keywords are matched in a single left-to-right pass, so text that
    belongs to an inserted tag is never matched again by a later keyword.
    When several keywords match at the same position the longest one wins.

    NOTE: the paragraph text itself is not HTML-escaped; callers that render
    the result as HTML from untrusted documents should take that into account.

    :param paragraph: paragraph text to process
    :param keyword_list: sensitive keywords to highlight; empty strings are ignored
    :param open_tag: markup inserted before each match
    :param close_tag: markup inserted after each match
    :return: the paragraph with every keyword occurrence wrapped in the tags
    """
    # De-duplicate while keeping order, and drop empty keywords: an empty
    # pattern would match between every pair of characters.
    unique_keywords = [kw for kw in dict.fromkeys(keyword_list) if kw]
    if not unique_keywords:
        return paragraph

    # Longest first so that 'abc' is preferred over its prefix 'ab'.
    unique_keywords.sort(key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(kw) for kw in unique_keywords))

    # A function replacement keeps backslashes in the tags literal.
    return pattern.sub(lambda match: open_tag + match.group(0) + close_tag,
                       paragraph)
def docx_find(file_path: str, keyword_list: list) -> dict:
    """
    Collect every paragraph of a .docx file that contains a sensitive keyword.

    The document text is extracted with ``docx2txt.process`` and split on
    newlines. All whitespace inside each resulting line is removed before
    matching, so the paragraphs returned are the whitespace-free text.
    Because of that, a keyword that itself contains whitespace cannot match.

    :param file_path: path of the .docx file to scan
    :param keyword_list: sensitive keywords to look for; empty strings are ignored
    :return: a dict of the form
        {'file_name': file_path,
         'find_list': [paragraph, ...],
         'paragraph_keyword': [[keyword, ...], ...]}
        where ``paragraph_keyword[i]`` lists the keywords found in ``find_list[i]``
    """
    doc_text = docx2txt.process(file_path)

    # An empty keyword is a substring of everything and would flag every line.
    keywords = [keyword for keyword in keyword_list if keyword]
    whitespace = re.compile(r'\s+')

    find_list = []
    paragraph_keyword = []
    for raw_para in doc_text.split('\n'):
        para = whitespace.sub('', raw_para)
        if not para:
            continue
        hits = [keyword for keyword in keywords if keyword in para]
        if hits:
            # The two lists are kept parallel: same index, same paragraph.
            find_list.append(para)
            paragraph_keyword.append(hits)

    return {
        'file_name': file_path,
        'find_list': find_list,
        'paragraph_keyword': paragraph_keyword,
    }
# print(docx_find('浅水海底电缆打捞大作业.docx', ['机密', '铲出']))
def util_keyword_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a single file for sensitive keywords and mark them in the hits.

    Only files whose name ends in ``.docx`` (compared case-insensitively) are
    scanned; they are passed to ``docx_find`` and every matching paragraph is
    then run through ``insert_html_tag``. Any other file yields an empty result
    with the same keys, so callers can read the dict without checking the
    file type first.

    :param file_path: path of the file to scan
    :param keyword_list: sensitive keywords to look for
    :return: a dict of the form
        {'file_name': file_path,
         'find_list': [marked paragraph, ...],
         'paragraph_keyword': [[keyword, ...], ...]}
    """
    # Unsupported file type: return an empty result with the full set of keys.
    if not file_path.lower().endswith('.docx'):
        return {'file_name': file_path, 'find_list': [], 'paragraph_keyword': []}

    find_dict = docx_find(file_path, keyword_list)
    # Build the marked list in one pass instead of locating each paragraph
    # with list.index() while the list is being modified.
    find_dict['find_list'] = [
        insert_html_tag(para, keyword_list) for para in find_dict['find_list']
    ]
    return find_dict
if __name__ == '__main__':
    # Manual smoke run: scan the sample document and print the result dict.
    sample_result = util_keyword_find('浅水海底电缆打捞大作业.docx', ['打捞', '浅水'])
    print(sample_result)