feat: ppt, excel, pdf, txt 文件敏感词检索
This commit is contained in:
parent
e5deab6fa2
commit
d059585180
|
@ -1,6 +1,10 @@
|
|||
import docx2txt
|
||||
from pptx import Presentation
|
||||
import re
|
||||
|
||||
import PyPDF2
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def insert_html_tag(paragraph: str, keyword_list: list) -> str:
|
||||
"""
|
||||
|
@ -18,6 +22,8 @@ def insert_html_tag(paragraph: str, keyword_list: list) -> str:
|
|||
|
||||
def docx_find(file_path: str, keyword_list: list) -> dict:
|
||||
"""
|
||||
docx 文件查找
|
||||
查找范围:所有内容 包含所有段落,页眉页脚,表格,文本框等
|
||||
:param file_path: 文件路径
|
||||
:param keyword_list: 列表形式的敏感词
|
||||
:return: 返回一个字典,包含文件名和敏感词所在的段落 {'file_name': 'xxx.docx', 'find_list': ['段落1', '段落2', ...]}
|
||||
|
@ -37,13 +43,132 @@ def docx_find(file_path: str, keyword_list: list) -> dict:
|
|||
paragraph_keyword.append(this_para_keyword)
|
||||
find_list.append(para)
|
||||
|
||||
print(len(find_list))
|
||||
# print(len(find_list))
|
||||
find_dict['find_list'] = find_list
|
||||
find_dict['paragraph_keyword'] = paragraph_keyword
|
||||
return find_dict
|
||||
|
||||
|
||||
# print(docx_find('浅水海底电缆打捞大作业.docx', ['机密', '铲出']))
|
||||
def pptx_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a PowerPoint file for sensitive keywords.

    Scope: every paragraph of every shape that owns a text frame, on every
    slide. Shapes without a text frame are skipped.

    A paragraph is matched as a whole (the text of all its runs joined
    together), so a keyword that is split across several runs -- for example
    because only part of it is bold -- is still detected.

    :param file_path: path of the presentation to scan
    :param keyword_list: sensitive keywords to look for
    :return: a dict of the form
             {'file_name': file_path,
              'find_list': [paragraph text with all whitespace removed, ...],
              'paragraph_keyword': [[keywords found in that paragraph], ...]}
             where 'find_list' and 'paragraph_keyword' are index-aligned.
    """
    prs = Presentation(file_path)  # open the presentation

    find_list = []
    paragraph_keyword = []

    for slide in prs.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                # Join the runs first: matching run by run would miss any
                # keyword that straddles a run boundary.
                raw_text = ''.join(run.text for run in paragraph.runs)
                # Whitespace is dropped so keywords broken up by spaces or
                # line breaks still match.
                para = re.sub(r'\s+', '', raw_text)
                if not para:
                    continue

                # keywords that occur in this paragraph
                this_para_keyword = [keyword for keyword in keyword_list if keyword in para]
                if this_para_keyword:
                    paragraph_keyword.append(this_para_keyword)
                    find_list.append(para)

    find_dict = {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
    return find_dict
|
||||
|
||||
|
||||
def pdf_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a PDF file for sensitive keywords.

    Scope: the extracted text of every page. Each page is treated as one
    unit of text, so a keyword that is split by a page break cannot be found.

    :param file_path: path of the PDF to scan
    :param keyword_list: sensitive keywords to look for
    :return: a dict of the form
             {'file_name': file_path,
              'find_list': [page text with all whitespace removed, ...],
              'paragraph_keyword': [[keywords found on that page], ...]}
             where 'find_list' and 'paragraph_keyword' are index-aligned and
             only pages containing at least one keyword are listed.
    """
    find_list = []
    paragraph_keyword = []

    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        for page in pdf_reader.pages:
            # `or ''` guards against a falsy extraction result (e.g. a page
            # with no text layer). One regex pass removes newlines, spaces
            # and any other whitespace, so keywords broken by layout line
            # wraps still match.
            page_text = re.sub(r'\s+', '', page.extract_text() or '')
            if not page_text:
                continue

            # keywords that occur on this page
            this_para_keyword = [keyword for keyword in keyword_list if keyword in page_text]
            if this_para_keyword:
                paragraph_keyword.append(this_para_keyword)
                find_list.append(page_text)

    find_dict = {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
    return find_dict
|
||||
|
||||
|
||||
def excel_find(file_path: str, keyword_list: list) -> dict:
    """
    Search an Excel workbook for sensitive keywords.

    Scope: every string-valued cell of every worksheet. Non-string cells are
    ignored; headers/footers and text boxes cannot be searched this way.

    :param file_path: path of the workbook to scan
    :param keyword_list: sensitive keywords to look for
    :return: a dict of the form
             {'file_name': file_path,
              'find_list': ['<sheet>, <row>, <column>, <cell text>', ...],
              'paragraph_keyword': [[keywords found in that cell], ...]}
             where 'find_list' and 'paragraph_keyword' are index-aligned.
    """
    find_list = []  # location + content of each cell containing a keyword
    paragraph_keyword = []  # keywords found in the corresponding cell

    # Open the workbook once and close it deterministically; every sheet is
    # then read from the same handle instead of reopening the file by path.
    with pd.ExcelFile(file_path) as xls:
        for sheet_name in xls.sheet_names:
            # header=None keeps the first row as data so it is searched too.
            df = pd.read_excel(xls, sheet_name=sheet_name, header=None)

            # walk every cell of the sheet
            for index, row in df.iterrows():
                for col_name, cell_value in row.items():
                    if not isinstance(cell_value, str):
                        continue

                    # keywords that occur in this cell
                    this_para_keyword = [keyword for keyword in keyword_list if keyword in cell_value]
                    if this_para_keyword:
                        paragraph_keyword.append(this_para_keyword)
                        # NOTE(review): the row is reported as index + 1 while
                        # the column label is reported as-is -- confirm this
                        # mixed numbering is what consumers expect.
                        find_list.append(f"{sheet_name}, {index + 1}, {col_name}, {cell_value}")

    find_dict = {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
    return find_dict
|
||||
|
||||
|
||||
def txt_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a plain-text file for sensitive keywords.

    Scope: every line of the file, each line being treated as one paragraph.

    The file is decoded explicitly instead of relying on the platform's
    default encoding: UTF-8 (with or without BOM) is tried first, then
    GB18030 (a superset of GBK/GB2312). If neither decodes cleanly the file
    is read as UTF-8 with undecodable bytes replaced, so the scan still runs.

    :param file_path: path of the text file to scan
    :param keyword_list: sensitive keywords to look for
    :return: a dict of the form
             {'file_name': file_path,
              'find_list': [line text with all whitespace removed, ...],
              'paragraph_keyword': [[keywords found in that line], ...]}
             where 'find_list' and 'paragraph_keyword' are index-aligned.
    """
    find_list = []
    paragraph_keyword = []

    text_content = None
    # 'utf-8-sig' also strips a leading BOM, which would otherwise end up
    # glued to the first paragraph.
    for encoding in ('utf-8-sig', 'gb18030'):
        try:
            with open(file_path, 'r', encoding=encoding) as txt_file:
                text_content = txt_file.read()
            break
        except UnicodeDecodeError:
            continue
    if text_content is None:
        # Last resort: keep going with replacement characters rather than
        # aborting the whole scan on a few bad bytes.
        with open(file_path, 'r', encoding='utf-8', errors='replace') as txt_file:
            text_content = txt_file.read()

    for para in text_content.split('\n'):
        # Whitespace is dropped so keywords broken up by spaces still match.
        para = re.sub(r'\s+', '', para)
        if not para:
            continue

        # keywords that occur in this paragraph
        this_para_keyword = [keyword for keyword in keyword_list if keyword in para]
        if this_para_keyword:
            paragraph_keyword.append(this_para_keyword)
            find_list.append(para)

    find_dict = {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
    return find_dict
|
||||
|
||||
|
||||
def util_keyword_find(file_path: str, keyword_list: list) -> dict:
|
||||
"""
|
||||
对指定单一文件进行敏感词查找
|
||||
|
@ -52,7 +177,7 @@ def util_keyword_find(file_path: str, keyword_list: list) -> dict:
|
|||
:return: 返回一个字典,包含文件名和敏感词所在的段落 {'file_name': 'xxx.docx', 'find_list': ['段落1', '段落2', ...]}
|
||||
"""
|
||||
# check file type
|
||||
if file_path.endswith('.docx'):
|
||||
if file_path.endswith(('.docx', '.doc', '.dot')):
|
||||
find_dict = docx_find(file_path, keyword_list) # 调用docx_find函数
|
||||
for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
|
||||
marked_para = insert_html_tag(para, keyword_list)
|
||||
|
@ -60,9 +185,38 @@ def util_keyword_find(file_path: str, keyword_list: list) -> dict:
|
|||
# print(find_dict)
|
||||
return find_dict
|
||||
|
||||
elif file_path.endswith(('.pptx', '.ppt', '.pot')):
|
||||
find_dict = pptx_find(file_path, keyword_list)
|
||||
for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
|
||||
marked_para = insert_html_tag(para, keyword_list)
|
||||
find_dict['find_list'][find_dict['find_list'].index(para)] = marked_para
|
||||
return find_dict
|
||||
|
||||
elif file_path.endswith(('.xlsx', '.xls', '.xlsm', '.xlsb', '.xltm', '.xltx')):
|
||||
find_dict = excel_find(file_path, keyword_list)
|
||||
for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
|
||||
marked_para = insert_html_tag(para, keyword_list)
|
||||
find_dict['find_list'][find_dict['find_list'].index(para)] = marked_para
|
||||
return find_dict
|
||||
|
||||
elif file_path.endswith('.pdf'):
|
||||
find_dict = pdf_find(file_path, keyword_list)
|
||||
for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
|
||||
marked_para = insert_html_tag(para, keyword_list)
|
||||
find_dict['find_list'][find_dict['find_list'].index(para)] = marked_para
|
||||
return find_dict
|
||||
|
||||
elif file_path.endswith('.txt'):
|
||||
find_dict = txt_find(file_path, keyword_list)
|
||||
for para, this_para_keyword in zip(find_dict['find_list'], find_dict['paragraph_keyword']):
|
||||
marked_para = insert_html_tag(para, keyword_list)
|
||||
find_dict['find_list'][find_dict['find_list'].index(para)] = marked_para
|
||||
return find_dict
|
||||
|
||||
else:
|
||||
return {'file_name': file_path, 'find_list': []}
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(util_keyword_find('浅水海底电缆打捞大作业.docx', ['打捞', '浅水']))
|
||||
# print(util_keyword_find('浅水海底电缆打捞大作业.docx', ['打捞', '浅水']))
|
||||
print(util_keyword_find('浅水.xlsx', ['打捞', '声呐', '浅水']))
|
||||
|
|
Loading…
Reference in New Issue