From 1ddb4e83ddb5c9121460f7ce5d0734831a0fe874 Mon Sep 17 00:00:00 2001 From: raiot Date: Thu, 24 Aug 2023 20:03:27 +0800 Subject: [PATCH] =?UTF-8?q?chore:=20=E5=88=A0=E9=99=A4=E5=BA=9F=E5=BC=83?= =?UTF-8?q?=E7=9A=84=E5=BC=95=E7=94=A8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- retriever/tools/keyword_find.py | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/retriever/tools/keyword_find.py b/retriever/tools/keyword_find.py index 2085b7e..3d19b81 100644 --- a/retriever/tools/keyword_find.py +++ b/retriever/tools/keyword_find.py @@ -1,4 +1,3 @@ -# from docx import Document import docx2txt import re @@ -29,37 +28,6 @@ def docx_find(file_path: str, keyword_list: list) -> dict: find_list = [] paragraph_keyword = [] - # 使用 python-docx 模块的检索 已废弃 - # # 检索所有段落 - # for para in doc_file.paragraphs: - # this_para_keyword = [] - # for keyword in keyword_list: # 可能存在性能问题 - # if keyword in para.text and not this_para_keyword: # 如果keyword在para.text中,且该段落没有被标记过 - # find_list.append(para.text) - # this_para_keyword.append(keyword) - # elif keyword in para.text and this_para_keyword: # 如果keyword在para.text中,且该段落已经被标记过 - # this_para_keyword.append(keyword) - # else: - # continue - # - # if this_para_keyword: - # # 若该段落被标记过,则将该段落的所有keyword加入到paragraph_keyword中 - # paragraph_keyword.append(this_para_keyword) - # # 检索所有页眉 - # for section in doc_file.sections: - # header = section.header - # if header is not None: - # for para in header.paragraphs: - # this_para_keyword = [] - # for keyword in keyword_list: - # if keyword in para.text and not this_para_keyword: - # find_list.append(para.text) - # this_para_keyword.append(keyword) - # elif keyword in para.text and this_para_keyword: - # this_para_keyword.append(keyword) - # else: - # continue - for para in doc_text.split('\n'): this_para_keyword = [keyword for keyword in keyword_list if keyword in para] # 查找该段落中的敏感词