feat: 优化 word 检测算法
This commit is contained in:
parent
0d05f22aab
commit
e6baff6681
|
@ -61,21 +61,12 @@ def docx_find(file_path: str, keyword_list: list) -> dict:
|
|||
# continue
|
||||
|
||||
for para in doc_text.split('\n'):
|
||||
this_para_keyword = []
|
||||
for keyword in keyword_list:
|
||||
if keyword in para and not this_para_keyword:
|
||||
# 如果敏感词在para中,且该段落没有被标记过
|
||||
find_list.append(para)
|
||||
this_para_keyword.append(keyword)
|
||||
elif keyword in para and this_para_keyword:
|
||||
# 如果敏感词在para中,且该段落已经被标记过
|
||||
this_para_keyword.append(keyword)
|
||||
else:
|
||||
continue
|
||||
this_para_keyword = [keyword for keyword in keyword_list if keyword in para] # 查找该段落中的敏感词
|
||||
|
||||
if this_para_keyword:
|
||||
# 若该段落中有敏感词,则将该段落的所有keyword加入到paragraph_keyword中
|
||||
# 若该段落中有敏感词,则将该段落的所有keyword及该段落分别加入到paragraph_keyword和find_list中
|
||||
paragraph_keyword.append(this_para_keyword)
|
||||
find_list.append(para)
|
||||
|
||||
print(len(find_list))
|
||||
find_dict['find_list'] = find_list
|
||||
|
@ -94,7 +85,7 @@ def util_keyword_find(file_path: str, keyword_list: list) -> dict:
|
|||
# check file type
|
||||
if file_path.endswith('.docx'):
|
||||
find_dict = docx_find(file_path, keyword_list) # 调用docx_find函数
|
||||
for para in find_dict['find_list']:
|
||||
for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
|
||||
marked_para = insert_html_tag(para, keyword_list)
|
||||
find_dict['find_list'][find_dict['find_list'].index(para)] = marked_para
|
||||
# print(find_dict)
|
||||
|
|
Loading…
Reference in New Issue