KeywordRetriever/retriever/tools/keyword_find.py

109 lines
4.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# from docx import Document
import docx2txt
import re
def insert_html_tag(paragraph: str, keyword_list: list) -> str:
    """Highlight every sensitive keyword in a paragraph with an HTML tag.

    Each occurrence of a keyword is wrapped in
    ``<span style="background-color:yellow">...</span>``.

    All keywords are matched in a single pass over the original text, so
    markup inserted for one keyword is never rescanned for another (a
    keyword such as ``"span"`` or ``"a"`` can no longer corrupt tags that
    were already inserted). When keywords overlap, the longest one wins.

    :param paragraph: the paragraph to process
    :param keyword_list: sensitive keywords, as a list of strings
    :return: the paragraph with every keyword occurrence highlighted
    """
    # Empty keywords are dropped: an empty pattern matches at every position
    # and would inject a tag between all characters. Duplicates are dropped
    # too; the stable sort keeps the caller's order among equal lengths.
    keywords = sorted(
        dict.fromkeys(keyword for keyword in keyword_list if keyword),
        key=len,
        reverse=True,
    )
    if not keywords:
        return paragraph

    # Regex alternation tries branches left to right, so longest-first makes
    # "secret file" take precedence over "secret" at the same position.
    pattern = re.compile('|'.join(re.escape(keyword) for keyword in keywords))

    # NOTE(review): the paragraph text itself is not HTML-escaped here; if it
    # can contain "<" or "&" and is rendered as HTML, confirm the caller
    # escapes it.
    return pattern.sub(
        lambda match: '<span style="background-color:yellow">'
        + match.group(0)
        + '</span>',
        paragraph,
    )
def docx_find(file_path: str, keyword_list: list) -> dict:
    """Find the paragraphs of a .docx file that contain sensitive keywords.

    The document text is extracted with ``docx2txt`` and split on newlines;
    each resulting line is treated as one paragraph. A paragraph is reported
    once, no matter how many keywords it contains.

    :param file_path: path of the .docx file
    :param keyword_list: sensitive keywords, as a list of strings
    :return: a dict with three keys:
        ``'file_name'`` - the given ``file_path``;
        ``'find_list'`` - the paragraphs containing at least one keyword;
        ``'paragraph_keyword'`` - for each entry of ``find_list`` (same
        order), the list of keywords found in that paragraph, in
        ``keyword_list`` order.
    """
    doc_text = docx2txt.process(file_path)

    # An empty keyword is a substring of every string and would flag every
    # paragraph, so it is ignored.
    keywords = [keyword for keyword in keyword_list if keyword]

    find_list = []
    paragraph_keyword = []
    for para in doc_text.split('\n'):
        hits = [keyword for keyword in keywords if keyword in para]
        if hits:
            find_list.append(para)
            paragraph_keyword.append(hits)

    return {
        'file_name': file_path,
        'find_list': find_list,
        'paragraph_keyword': paragraph_keyword,
    }
# print(docx_find('浅水海底电缆打捞大作业.docx', ['机密', '铲出']))
def util_keyword_find(file_path: str, keyword_list: list) -> dict:
    """Search a single file for sensitive keywords and highlight the hits.

    Only ``.docx`` files are searched (the extension check is
    case-insensitive). Any other file yields an empty result with the same
    keys, so callers can read the result uniformly.

    :param file_path: path of the file to search
    :param keyword_list: sensitive keywords, as a list of strings
    :return: a dict with three keys:
        ``'file_name'`` - the given ``file_path``;
        ``'find_list'`` - the matching paragraphs, with each keyword wrapped
        in an HTML highlight tag;
        ``'paragraph_keyword'`` - the keywords found in each matching
        paragraph, in the same order as ``find_list``.
    """
    # Unsupported file type: nothing to search.
    if not file_path.lower().endswith('.docx'):
        return {'file_name': file_path, 'find_list': [], 'paragraph_keyword': []}

    find_dict = docx_find(file_path, keyword_list)
    # Rebuild the list positionally rather than via list.index(), which is
    # quadratic and can hit the wrong slot when an already-marked paragraph
    # equals a later one.
    find_dict['find_list'] = [
        insert_html_tag(para, keyword_list) for para in find_dict['find_list']
    ]
    return find_dict
if __name__ == '__main__':
    # Manual smoke test: search a sample document and print the result.
    demo_result = util_keyword_find('浅水海底电缆打捞大作业.docx', ['打捞', '浅水'])
    print(demo_result)