feat: ppt, excel, pdf, txt 文件敏感词检索

2023-08-24 23:16:56 +08:00 · 2023-08-24 23:16:56 +08:00 · d059585180
parent e5deab6fa2
commit d059585180
1 changed files with 158 additions and 4 deletions
--- a/retriever/tools/keyword_find.py
+++ b/retriever/tools/keyword_find.py
@@ -1,6 +1,10 @@
 import docx2txt
+from pptx import Presentation
 import re

+import PyPDF2
+import pandas as pd
+

 def insert_html_tag(paragraph: str, keyword_list: list) -> str:
    """
@@ -18,6 +22,8 @@ def insert_html_tag(paragraph: str, keyword_list: list) -> str:

 def docx_find(file_path: str, keyword_list: list) -> dict:
    """
+    docx 文件查找
+    查找范围：所有内容 包含所有段落，页眉页脚，表格，文本框等
    :param file_path: 文件路径
    :param keyword_list: 列表形式的敏感词
    :return: 返回一个字典，包含文件名和敏感词所在的段落 {'file_name': 'xxx.docx', 'find_list': ['段落1', '段落2', ...]}
@@ -37,13 +43,132 @@ def docx_find(file_path: str, keyword_list: list) -> dict:
            paragraph_keyword.append(this_para_keyword)
            find_list.append(para)

-    print(len(find_list))
+    # print(len(find_list))
    find_dict['find_list'] = find_list
    find_dict['paragraph_keyword'] = paragraph_keyword
    return find_dict


-# print(docx_find('浅水海底电缆打捞大作业.docx', ['机密', '铲出']))
+def pptx_find(file_path: str, keyword_list: list) -> dict:
+    """
+    Search a PowerPoint presentation for sensitive keywords.
+
+    Scope: every paragraph of every shape that has a text frame, on every
+    slide. Shapes without a text frame are skipped. The runs of a paragraph
+    are joined before matching, so a keyword that is split across several
+    runs (for example by a formatting change in the middle of a word) is
+    still found.
+
+    :param file_path: path of the presentation to open
+    :param keyword_list: list of sensitive keywords to look for
+    :return: dict with 'file_name' (the given path), 'find_list' (matching
+        paragraphs with all whitespace removed) and 'paragraph_keyword'
+        (the keywords found in each matching paragraph, in the same order
+        as 'find_list')
+    """
+    prs = Presentation(file_path)  # open the presentation
+
+    # Collect one string per paragraph; matching per run would miss keywords
+    # whose characters are spread over two adjacent runs.
+    paragraph_texts = []
+    for slide in prs.slides:
+        for shape in slide.shapes:
+            if not shape.has_text_frame:
+                continue
+            for paragraph in shape.text_frame.paragraphs:
+                paragraph_texts.append("".join(run.text for run in paragraph.runs))
+
+    find_list = []
+    paragraph_keyword = []
+    for text in paragraph_texts:
+        # A run may itself contain line breaks; treat each line separately.
+        for para in text.split('\n'):
+            # Strip all whitespace so spacing inside a keyword cannot hide it.
+            para = re.sub(r'\s+', '', para)
+            this_para_keyword = [keyword for keyword in keyword_list if keyword in para]
+
+            if this_para_keyword:
+                paragraph_keyword.append(this_para_keyword)
+                find_list.append(para)
+
+    find_dict = {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
+    return find_dict
+
+
+def pdf_find(file_path: str, keyword_list: list) -> dict:
+    """
+    Search a PDF file for sensitive keywords.
+
+    Scope: the extracted text of every page. Line breaks and spaces are
+    removed from each page's text and the pages are then matched one by one,
+    so every entry of 'find_list' is the whole text of one page. A keyword
+    that straddles a page boundary is not found.
+
+    :param file_path: path of the PDF file to open
+    :param keyword_list: list of sensitive keywords to look for
+    :return: dict with 'file_name' (the given path), 'find_list' (text of
+        each matching page, whitespace removed) and 'paragraph_keyword'
+        (the keywords found in each matching page, in the same order as
+        'find_list')
+    """
+    with open(file_path, 'rb') as stream:
+        reader = PyPDF2.PdfReader(stream)
+        page_texts = [
+            page.extract_text().replace("\n", "").replace(" ", "")
+            for page in reader.pages
+        ]
+
+    # Every page contributes its text followed by a newline, then the whole
+    # string is cut at the newlines again (one segment per page plus a final
+    # empty segment).
+    segments = "".join(text + "\n" for text in page_texts).split('\n')
+
+    matches = []
+    for segment in segments:
+        compact = re.sub(r'\s+', '', segment)
+        hits = [keyword for keyword in keyword_list if keyword in compact]
+        if hits:
+            matches.append((compact, hits))
+
+    return {
+        'file_name': file_path,
+        'find_list': [compact for compact, _ in matches],
+        'paragraph_keyword': [hits for _, hits in matches],
+    }
+
+
+def excel_find(file_path: str, keyword_list: list) -> dict:
+    """
+    Search an Excel workbook for sensitive keywords.
+
+    Scope: every string cell of every worksheet. Non-string cells are
+    skipped. Headers/footers and text boxes cannot be searched this way.
+
+    :param file_path: path of the workbook to open
+    :param keyword_list: list of sensitive keywords to look for
+    :return: dict with 'file_name' (the given path), 'find_list' (one
+        "sheet, row, column, value" string per matching cell, where row is
+        the DataFrame row index plus one and column is the DataFrame column
+        label) and 'paragraph_keyword' (the keywords found in each matching
+        cell, in the same order as 'find_list')
+    """
+    find_list = []  # location and content of every cell that contains a keyword
+    paragraph_keyword = []  # keywords found in each of those cells
+
+    # Open the workbook once and make sure the handle is closed afterwards;
+    # each sheet is parsed from this handle instead of re-reading the file.
+    with pd.ExcelFile(file_path) as xls:
+        for sheet_name in xls.sheet_names:
+            # header=None: the first row is data, not column names.
+            df = pd.read_excel(xls, sheet_name=sheet_name, header=None)
+
+            # Visit every cell of the sheet.
+            for index, row in df.iterrows():
+                for col_name, cell_value in row.items():
+                    if not isinstance(cell_value, str):
+                        continue
+
+                    this_para_keyword = [keyword for keyword in keyword_list if keyword in cell_value]
+
+                    if this_para_keyword:
+                        paragraph_keyword.append(this_para_keyword)
+                        find_list.append(f"{sheet_name}, {index + 1}, {col_name}, {cell_value}")
+
+    find_dict = {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
+
+    return find_dict
+
+
+def txt_find(file_path: str, keyword_list: list) -> dict:
+    """
+    Search a plain-text file for sensitive keywords, line by line.
+
+    The file is decoded as UTF-8 (with or without BOM) when possible, then
+    as GB18030; if both fail it is read as UTF-8 with undecodable bytes
+    replaced, so the search never depends on the platform default encoding.
+
+    :param file_path: path of the text file to open
+    :param keyword_list: list of sensitive keywords to look for
+    :return: dict with 'file_name' (the given path), 'find_list' (matching
+        lines with all whitespace removed) and 'paragraph_keyword' (the
+        keywords found in each matching line, in the same order as
+        'find_list')
+    """
+    find_list = []
+    paragraph_keyword = []
+
+    for encoding in ('utf-8-sig', 'gb18030'):
+        try:
+            with open(file_path, 'r', encoding=encoding) as txt_file:
+                text_content = txt_file.read()
+        except UnicodeDecodeError:
+            continue
+        break
+    else:
+        # Neither encoding fits: keep whatever can be decoded rather than
+        # failing the whole search.
+        with open(file_path, 'r', encoding='utf-8', errors='replace') as txt_file:
+            text_content = txt_file.read()
+
+    for para in text_content.split('\n'):
+        # Strip all whitespace so spacing inside a keyword cannot hide it.
+        para = re.sub(r'\s+', '', para)
+        this_para_keyword = [keyword for keyword in keyword_list if keyword in para]
+
+        if this_para_keyword:
+            paragraph_keyword.append(this_para_keyword)
+            find_list.append(para)
+
+    find_dict = {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
+    return find_dict
+
+
 def util_keyword_find(file_path: str, keyword_list: list) -> dict:
    """
    对指定单一文件进行敏感词查找
@@ -52,7 +177,7 @@ def util_keyword_find(file_path: str, keyword_list: list) -> dict:
    :return: 返回一个字典，包含文件名和敏感词所在的段落 {'file_name': 'xxx.docx', 'find_list': ['段落1', '段落2', ...]}
    """
    # check file type
-    if file_path.endswith('.docx'):
+    if file_path.endswith(('.docx', '.doc', '.dot')):
        find_dict = docx_find(file_path, keyword_list)  # 调用docx_find函数
        for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
            marked_para = insert_html_tag(para, keyword_list)
@@ -60,9 +185,38 @@ def util_keyword_find(file_path: str, keyword_list: list) -> dict:
        # print(find_dict)
        return find_dict

+    elif file_path.endswith(('.pptx', '.ppt', '.pot')):
+        find_dict = pptx_find(file_path, keyword_list)
+        for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
+            marked_para = insert_html_tag(para, keyword_list)
+            find_dict['find_list'][find_dict['find_list'].index(para)] = marked_para
+        return find_dict
+
+    elif file_path.endswith(('.xlsx', '.xls', '.xlsm', '.xlsb', '.xltm', '.xltx')):
+        find_dict = excel_find(file_path, keyword_list)
+        for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
+            marked_para = insert_html_tag(para, keyword_list)
+            find_dict['find_list'][find_dict['find_list'].index(para)] = marked_para
+        return find_dict
+
+    elif file_path.endswith('.pdf'):
+        find_dict = pdf_find(file_path, keyword_list)
+        for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
+            marked_para = insert_html_tag(para, keyword_list)
+            find_dict['find_list'][find_dict['find_list'].index(para)] = marked_para
+        return find_dict
+
+    elif file_path.endswith('.txt'):
+        find_dict = txt_find(file_path, keyword_list)
+        for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
+            marked_para = insert_html_tag(para, keyword_list)
+            find_dict['find_list'][find_dict['find_list'].index(para)] = marked_para
+        return find_dict
+
    else:
        return {'file_name': file_path, 'find_list': []}


 if __name__ == '__main__':
-    print(util_keyword_find('浅水海底电缆打捞大作业.docx', ['打捞', '浅水']))
+    # print(util_keyword_find('浅水海底电缆打捞大作业.docx', ['打捞', '浅水']))
+    print(util_keyword_find('浅水.xlsx', ['打捞', '声呐', '浅水']))