feat: 优化 word 检测算法

This commit is contained in:
raiot 2023-08-24 19:59:36 +08:00
parent 0d05f22aab
commit e6baff6681
1 changed file with 4 additions and 13 deletions

View File

@@ -61,21 +61,12 @@ def docx_find(file_path: str, keyword_list: list) -> dict:
# continue
for para in doc_text.split('\n'):
this_para_keyword = []
for keyword in keyword_list:
if keyword in para and not this_para_keyword:
# 如果敏感词在para中且该段落没有被标记过
find_list.append(para)
this_para_keyword.append(keyword)
elif keyword in para and this_para_keyword:
# 如果敏感词在para中且该段落已经被标记过
this_para_keyword.append(keyword)
else:
continue
this_para_keyword = [keyword for keyword in keyword_list if keyword in para] # 查找该段落中的敏感词
if this_para_keyword:
# 若该段落中有敏感词则将该段落的所有keyword加入到paragraph_keyword中
# 若该段落中有敏感词则将该段落的所有keyword及该段落分别加入到paragraph_keyword和find_list中
paragraph_keyword.append(this_para_keyword)
find_list.append(para)
print(len(find_list))
find_dict['find_list'] = find_list
@@ -94,7 +85,7 @@ def util_keyword_find(file_path: str, keyword_list: list) -> dict:
# check file type
if file_path.endswith('.docx'):
find_dict = docx_find(file_path, keyword_list) # 调用docx_find函数
for para in find_dict['find_list']:
for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
marked_para = insert_html_tag(para, keyword_list)
find_dict['find_list'][find_dict['find_list'].index(para)] = marked_para
# print(find_dict)