# KeywordRetriever/retriever/tools/keyword_find.py

import docx2txt
from pptx import Presentation
import re

import PyPDF2
import pandas as pd


def insert_html_tag(paragraph: str, keyword_list: list) -> str:
    """
    Highlight every keyword occurrence in a paragraph with a yellow HTML span.

    All keywords are matched in a single left-to-right pass, so text that was
    already wrapped is never scanned again. This keeps keywords such as
    "span" or "yellow" from corrupting previously inserted markup and stops
    overlapping keywords (e.g. "abc" and "ab") from producing nested tags.

    NOTE(review): the paragraph text itself is not HTML-escaped here; confirm
    whether the consumer of the returned string expects escaped content.

    :param paragraph: paragraph text to process
    :param keyword_list: keywords to highlight; empty strings are ignored
    :return: the paragraph with each keyword occurrence wrapped in a span
    """
    # De-duplicate while keeping order, drop empty keywords (an empty pattern
    # would match between every character), then put longer keywords first
    # so that at any position the longest keyword wins.
    keywords = sorted(dict.fromkeys(k for k in keyword_list if k), key=len, reverse=True)
    if not keywords:
        return paragraph

    # Keywords are literal text, not regular expressions.
    pattern = re.compile('|'.join(re.escape(k) for k in keywords))
    return pattern.sub(
        lambda match: '<span style="background-color:yellow">' + match.group(0) + '</span>',
        paragraph,
    )


def docx_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a docx file for keywords, line by line.

    The text is extracted with ``docx2txt.process`` and split on newlines.
    Which parts of the document end up in that text (the original note lists
    body paragraphs, headers/footers, tables and text boxes) is determined by
    docx2txt, not by this function.

    :param file_path: path of the file to search
    :param keyword_list: list of keywords
    :return: dict of the form
        {'file_name': file_path,
         'find_list': [paragraph, ...],
         'paragraph_keyword': [[keyword, ...], ...]}
        where the two lists are aligned index by index
    """
    extracted_text = docx2txt.process(file_path)
    whitespace = re.compile(r'\s+')

    matched_paragraphs = []
    matched_keywords = []
    for raw_line in extracted_text.split('\n'):
        # All whitespace is removed before matching, so the stored paragraph
        # is the compacted text.
        compact = whitespace.sub('', raw_line)
        hits = [kw for kw in keyword_list if kw in compact]
        if not hits:
            continue
        matched_keywords.append(hits)
        matched_paragraphs.append(compact)

    return {
        'file_name': file_path,
        'find_list': matched_paragraphs,
        'paragraph_keyword': matched_keywords,
    }


def pptx_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a PowerPoint file for keywords, paragraph by paragraph.

    Scope: every paragraph of every shape that has a text frame, on every
    slide. Only the shapes listed directly in ``slide.shapes`` are visited;
    this function does not descend into other containers.

    :param file_path: path of the presentation to search
    :param keyword_list: list of keywords
    :return: dict of the form
        {'file_name': file_path,
         'find_list': [paragraph, ...],
         'paragraph_keyword': [[keyword, ...], ...]}
        where the two lists are aligned index by index
    """
    prs = Presentation(file_path)

    find_list = []
    paragraph_keyword = []

    for slide in prs.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                # A paragraph's text can be spread over several runs, so the
                # runs are joined before matching; checking each run on its
                # own would miss a keyword that straddles a run boundary.
                text = ''.join(run.text for run in paragraph.runs)
                para = re.sub(r'\s+', '', text)  # whitespace is ignored when matching
                this_para_keyword = [keyword for keyword in keyword_list if keyword in para]

                if this_para_keyword:
                    paragraph_keyword.append(this_para_keyword)
                    find_list.append(para)

    return {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}


def pdf_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a PDF file for keywords, one page at a time.

    Each page's extracted text is flattened (newlines and spaces removed) and
    treated as a single paragraph. Pages are matched independently, so a
    keyword that is split across a page break will not be found.

    :param file_path: path of the PDF to search
    :param keyword_list: list of keywords
    :return: dict of the form
        {'file_name': file_path,
         'find_list': [page_text, ...],
         'paragraph_keyword': [[keyword, ...], ...]}
        where the two lists are aligned index by index
    """
    matched_pages = []
    matched_keywords = []

    with open(file_path, 'rb') as stream:
        reader = PyPDF2.PdfReader(stream)
        page_chunks = []
        # Extraction happens while the file is still open.
        for page in reader.pages:
            flattened = page.extract_text().replace("\n", "").replace(" ", "")
            page_chunks.append(flattened + "\n")

    joined_text = "".join(page_chunks)

    for chunk in joined_text.split('\n'):
        compact = re.sub(r'\s+', '', chunk)
        hits = [kw for kw in keyword_list if kw in compact]
        if not hits:
            continue
        matched_keywords.append(hits)
        matched_pages.append(compact)

    return {'file_name': file_path, 'find_list': matched_pages, 'paragraph_keyword': matched_keywords}


def excel_find(file_path: str, keyword_list: list) -> dict:
    """
    Search an Excel workbook for keywords, cell by cell.

    Scope: every string-valued cell of every worksheet. Only values that
    pandas loads into the sheet's DataFrame are visited, so content stored
    outside cells (headers/footers, text boxes) is not searched.

    :param file_path: path of the workbook to search
    :param keyword_list: list of keywords
    :return: dict of the form
        {'file_name': file_path,
         'find_list': ['<sheet>, <row>, <column>, <cell text>', ...],
         'paragraph_keyword': [[keyword, ...], ...]}
        where the two lists are aligned index by index
    """
    find_list = []  # location and text of each cell that contains a keyword
    paragraph_keyword = []  # keywords found in the corresponding find_list entry

    # The context manager closes the workbook handle, and the already opened
    # workbook is reused for every sheet instead of re-opening the file.
    with pd.ExcelFile(file_path) as xls:
        for sheet_name in xls.sheet_names:
            # header=None: the first row is data, not column names.
            df = pd.read_excel(xls, sheet_name=sheet_name, header=None)

            for index, row in df.iterrows():
                for col_name, cell_value in row.items():
                    if not isinstance(cell_value, str):
                        continue

                    this_para_keyword = [keyword for keyword in keyword_list if keyword in cell_value]

                    if this_para_keyword:
                        paragraph_keyword.append(this_para_keyword)
                        # The row label is reported with +1 applied; the
                        # column label is reported as pandas provides it.
                        find_list.append(f"{sheet_name}, {index + 1}, {col_name}, {cell_value}")

    return {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}


def txt_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a plain-text file for keywords, line by line.

    The file is decoded by trying, in order: UTF-8 (a leading BOM is
    stripped), the platform default encoding, and GB18030. The first
    encoding that decodes the whole file is used.

    :param file_path: path of the text file to search
    :param keyword_list: list of keywords
    :return: dict of the form
        {'file_name': file_path,
         'find_list': [paragraph, ...],
         'paragraph_keyword': [[keyword, ...], ...]}
        where the two lists are aligned index by index
    :raises UnicodeDecodeError: if none of the candidate encodings can decode
        the file
    """
    find_list = []
    paragraph_keyword = []

    last_error = None
    # None means "platform default", i.e. what a plain open(path, 'r') uses,
    # so any file that could be read before can still be read.
    for encoding in ('utf-8-sig', None, 'gb18030'):
        try:
            with open(file_path, 'r', encoding=encoding) as txt_file:
                text_content = txt_file.read()
        except UnicodeDecodeError as err:
            last_error = err
        else:
            break
    else:
        raise last_error

    for para in text_content.split('\n'):
        para = re.sub(r'\s+', '', para)  # whitespace is ignored when matching
        this_para_keyword = [keyword for keyword in keyword_list if keyword in para]

        if this_para_keyword:
            paragraph_keyword.append(this_para_keyword)
            find_list.append(para)

    return {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}


def util_keyword_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a single file for keywords and highlight the hits.

    The file type is chosen from the file extension (case-insensitive) and
    the matching ``*_find`` function is called. Every entry of the returned
    'find_list' is then passed through ``insert_html_tag``.

    :param file_path: path of the file to search
    :param keyword_list: list of keywords
    :return: dict of the form
        {'file_name': file_path,
         'find_list': [highlighted paragraph, ...],
         'paragraph_keyword': [[keyword, ...], ...]}
        For an unsupported extension both lists are empty.
    """
    # Compare on a lower-cased copy so 'REPORT.DOCX' is handled like
    # 'report.docx'; the original path is still what gets opened and reported.
    lowered_path = file_path.lower()

    if lowered_path.endswith(('.docx', '.doc', '.dot')):
        finder = docx_find
    elif lowered_path.endswith(('.pptx', '.ppt', '.pot')):
        finder = pptx_find
    elif lowered_path.endswith(('.xlsx', '.xls', '.xlsm', '.xlsb', '.xltm', '.xltx')):
        finder = excel_find
    elif lowered_path.endswith('.pdf'):
        finder = pdf_find
    elif lowered_path.endswith('.txt'):
        finder = txt_find
    else:
        # Same keys as every other branch, so callers can read
        # 'paragraph_keyword' without checking the file type first.
        return {'file_name': file_path, 'find_list': [], 'paragraph_keyword': []}

    find_dict = finder(file_path, keyword_list)
    # Build the highlighted list in one pass, by position, instead of looking
    # each paragraph up with list.index() while the list is being modified.
    find_dict['find_list'] = [insert_html_tag(para, keyword_list) for para in find_dict['find_list']]
    return find_dict


if __name__ == '__main__':
    # Manual smoke test against a local sample workbook. A docx sample
    # ('浅水海底电缆打捞大作业.docx' with keywords '打捞' and '浅水') can be
    # substituted to exercise the Word branch instead.
    sample_keywords = ['打捞', '声呐', '浅水']
    sample_result = util_keyword_find('浅水.xlsx', sample_keywords)
    print(sample_result)