feat: ppt, excel, pdf, txt 文件敏感词检索

This commit is contained in:
raiot 2023-08-24 23:16:56 +08:00
parent e5deab6fa2
commit d059585180
1 changed files with 158 additions and 4 deletions

View File

@ -1,6 +1,10 @@
import docx2txt
from pptx import Presentation
import re
import PyPDF2
import pandas as pd
def insert_html_tag(paragraph: str, keyword_list: list) -> str:
"""
@ -18,6 +22,8 @@ def insert_html_tag(paragraph: str, keyword_list: list) -> str:
def docx_find(file_path: str, keyword_list: list) -> dict:
"""
docx 文件查找
查找范围所有内容 包含所有段落页眉页脚表格文本框等
:param file_path: 文件路径
:param keyword_list: 列表形式的敏感词
:return: 返回一个字典包含文件名和敏感词所在的段落 {'file_name': 'xxx.docx', 'find_list': ['段落1', '段落2', ...]}
@ -37,13 +43,132 @@ def docx_find(file_path: str, keyword_list: list) -> dict:
paragraph_keyword.append(this_para_keyword)
find_list.append(para)
print(len(find_list))
# print(len(find_list))
find_dict['find_list'] = find_list
find_dict['paragraph_keyword'] = paragraph_keyword
return find_dict
# print(docx_find('浅水海底电缆打捞大作业.docx', ['机密', '铲出']))
def pptx_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a PowerPoint presentation for sensitive keywords.

    Every paragraph of every shape that owns a text frame, on every slide,
    is examined. The runs of a paragraph are joined before matching, so a
    keyword that is stored across several runs (for example because part of
    it carries different formatting) is still found. Shapes without a text
    frame are not searched.

    :param file_path: path of the presentation, handed to ``Presentation``
    :param keyword_list: keywords to look for
    :return: ``{'file_name': file_path, 'find_list': [...], 'paragraph_keyword': [...]}``;
        ``find_list`` holds the whitespace-stripped text of each matching
        paragraph and ``paragraph_keyword`` holds, at the same index, the
        keywords found in that paragraph
    """
    prs = Presentation(file_path)
    find_list = []
    paragraph_keyword = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                # Match on the whole paragraph, not run by run: a run boundary
                # can fall in the middle of a keyword.
                para = re.sub(r'\s+', '', ''.join(run.text for run in paragraph.runs))
                this_para_keyword = [keyword for keyword in keyword_list if keyword in para]
                if this_para_keyword:
                    paragraph_keyword.append(this_para_keyword)
                    find_list.append(para)
    return {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
def pdf_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a PDF file for sensitive keywords.

    The extracted text of each page is treated as one unit: all whitespace
    (including line breaks) is removed and the keywords are matched against
    the result. Pages are matched independently, so a keyword that straddles
    a page break is not found.

    :param file_path: path of the PDF file
    :param keyword_list: keywords to look for
    :return: ``{'file_name': file_path, 'find_list': [...], 'paragraph_keyword': [...]}``;
        ``find_list`` holds the whitespace-stripped text of each matching
        page and ``paragraph_keyword`` holds, at the same index, the keywords
        found on that page
    """
    find_list = []
    paragraph_keyword = []
    # The file is kept open while pages are extracted.
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        for page in pdf_reader.pages:
            # One pass removes newlines, spaces and any other whitespace.
            para = re.sub(r'\s+', '', page.extract_text())
            this_para_keyword = [keyword for keyword in keyword_list if keyword in para]
            if this_para_keyword:
                paragraph_keyword.append(this_para_keyword)
                find_list.append(para)
    return {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
def excel_find(file_path: str, keyword_list: list) -> dict:
    """
    Search an Excel workbook for sensitive keywords.

    Every cell of every worksheet is examined; only cells whose value is a
    string are matched. The first row is read as data (``header=None``).

    :param file_path: path of the workbook
    :param keyword_list: keywords to look for
    :return: ``{'file_name': file_path, 'find_list': [...], 'paragraph_keyword': [...]}``;
        each ``find_list`` entry is the string
        ``"<sheet name>, <row index + 1>, <column label>, <cell value>"`` and
        ``paragraph_keyword`` holds, at the same index, the keywords found in
        that cell
    """
    find_list = []  # location and content of every matching cell
    paragraph_keyword = []  # keywords found in the cell at the same index
    # Open the workbook once and close it deterministically; each sheet is
    # parsed from the already-open handle instead of re-reading the file.
    with pd.ExcelFile(file_path) as xls:
        for sheet_name in xls.sheet_names:
            df = xls.parse(sheet_name, header=None)
            for index, row in df.iterrows():
                for col_name, cell_value in row.items():
                    if not isinstance(cell_value, str):
                        continue
                    this_para_keyword = [keyword for keyword in keyword_list if keyword in cell_value]
                    if this_para_keyword:
                        paragraph_keyword.append(this_para_keyword)
                        find_list.append(f"{sheet_name}, {index + 1}, {col_name}, {cell_value}")
    return {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
def txt_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a plain-text file for sensitive keywords.

    The file is split into lines; all whitespace is removed from each line
    before the keywords are matched against it.

    The encoding is not taken from the platform default. The file is decoded
    as UTF-8 (a leading BOM is dropped), then as GB18030 if that fails, and
    as a last resort as UTF-8 with undecodable bytes replaced, so the search
    does not abort on an unexpected encoding.

    :param file_path: path of the text file
    :param keyword_list: keywords to look for
    :return: ``{'file_name': file_path, 'find_list': [...], 'paragraph_keyword': [...]}``;
        ``find_list`` holds the whitespace-stripped text of each matching
        line and ``paragraph_keyword`` holds, at the same index, the keywords
        found in that line
    """
    for encoding in ('utf-8-sig', 'gb18030'):
        try:
            with open(file_path, 'r', encoding=encoding) as txt_file:
                text_content = txt_file.read()
            break
        except UnicodeDecodeError:
            continue
    else:
        # Neither candidate decoded cleanly: keep whatever text is readable.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as txt_file:
            text_content = txt_file.read()

    find_list = []
    paragraph_keyword = []
    for para in text_content.split('\n'):
        para = re.sub(r'\s+', '', para)
        this_para_keyword = [keyword for keyword in keyword_list if keyword in para]
        if this_para_keyword:
            paragraph_keyword.append(this_para_keyword)
            find_list.append(para)
    return {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
def util_keyword_find(file_path: str, keyword_list: list) -> dict:
"""
对指定单一文件进行敏感词查找
@ -52,7 +177,7 @@ def util_keyword_find(file_path: str, keyword_list: list) -> dict:
:return: 返回一个字典包含文件名和敏感词所在的段落 {'file_name': 'xxx.docx', 'find_list': ['段落1', '段落2', ...]}
"""
# check file type
if file_path.endswith('.docx'):
if file_path.endswith(('.docx', '.doc', '.dot')):
find_dict = docx_find(file_path, keyword_list) # 调用docx_find函数
for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
marked_para = insert_html_tag(para, keyword_list)
@ -60,9 +185,38 @@ def util_keyword_find(file_path: str, keyword_list: list) -> dict:
# print(find_dict)
return find_dict
elif file_path.endswith(('.pptx', '.ppt', '.pot')):
find_dict = pptx_find(file_path, keyword_list)
for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
marked_para = insert_html_tag(para, keyword_list)
find_dict['find_list'][find_dict['find_list'].index(para)] = marked_para
return find_dict
elif file_path.endswith(('.xlsx', '.xls', '.xlsm', '.xlsb', '.xltm', '.xltx')):
find_dict = excel_find(file_path, keyword_list)
for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
marked_para = insert_html_tag(para, keyword_list)
find_dict['find_list'][find_dict['find_list'].index(para)] = marked_para
return find_dict
elif file_path.endswith('.pdf'):
find_dict = pdf_find(file_path, keyword_list)
for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
marked_para = insert_html_tag(para, keyword_list)
find_dict['find_list'][find_dict['find_list'].index(para)] = marked_para
return find_dict
elif file_path.endswith('.txt'):
find_dict = txt_find(file_path, keyword_list)
for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
marked_para = insert_html_tag(para, keyword_list)
find_dict['find_list'][find_dict['find_list'].index(para)] = marked_para
return find_dict
else:
return {'file_name': file_path, 'find_list': []}
if __name__ == '__main__':
print(util_keyword_find('浅水海底电缆打捞大作业.docx', ['打捞', '浅水']))
# print(util_keyword_find('浅水海底电缆打捞大作业.docx', ['打捞', '浅水']))
print(util_keyword_find('浅水.xlsx', ['打捞', '声呐', '浅水']))