"""Sensitive-word search utilities for docx, pptx, pdf, xlsx and txt files."""
import re

import docx2txt
import pandas as pd
import PyPDF2
from pptx import Presentation


def insert_html_tag(paragraph: str, keyword_list: list) -> str:
    """Highlight every sensitive word in a paragraph with an HTML span.

    All keywords are substituted in a single regex pass. The original
    sequential ``str.replace`` loop could corrupt markup inserted for an
    earlier keyword whenever a later keyword matched inside the inserted
    tag itself (e.g. keywords such as "span" or "yellow"), or inside an
    already-highlighted longer keyword. Keywords are tried longest-first
    so overlapping keywords prefer the longer match.

    :param paragraph: paragraph text to process
    :param keyword_list: sensitive words, as a list
    :return: the paragraph with each keyword wrapped in a highlight span
    """
    if not keyword_list:
        return paragraph
    # Longest-first ordering makes the alternation prefer longer keywords.
    pattern = '|'.join(re.escape(keyword)
                       for keyword in sorted(keyword_list, key=len, reverse=True))
    return re.sub(
        pattern,
        lambda match: '<span style="background-color:yellow">' + match.group(0) + '</span>',
        paragraph,
    )


def docx_find(file_path: str, keyword_list: list) -> dict:
    """Search a .docx file for sensitive words.

    Scope: all extracted text, including body paragraphs, headers/footers,
    tables and text boxes (everything docx2txt surfaces).

    :param file_path: path of the file to scan
    :param keyword_list: sensitive words, as a list
    :return: {'file_name': ..., 'find_list': [matched paragraphs],
              'paragraph_keyword': [keywords found in each matched paragraph]}
    """
    extracted_text = docx2txt.process(file_path)  # full text via docx2txt

    matched_paragraphs = []
    matched_keywords = []

    for raw_line in extracted_text.split('\n'):
        # Strip all whitespace so keywords split by spacing still match.
        line = re.sub(r'\s+', '', raw_line)
        hits = [keyword for keyword in keyword_list if keyword in line]
        if hits:
            matched_keywords.append(hits)
            matched_paragraphs.append(line)

    return {
        'file_name': file_path,
        'find_list': matched_paragraphs,
        'paragraph_keyword': matched_keywords,
    }


def pptx_find(file_path: str, keyword_list: list) -> dict:
    """Search a .pptx file for sensitive words.

    Scope: every text frame on every slide.

    :param file_path: path of the file to scan
    :param keyword_list: sensitive words, as a list
    :return: {'file_name': ..., 'find_list': [matched paragraphs],
              'paragraph_keyword': [keywords found in each matched paragraph]}
    """
    presentation = Presentation(file_path)

    # Collect each run's text; joining with a trailing "" reproduces the
    # newline-terminated accumulation of the original implementation.
    fragments = []
    for slide in presentation.slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                for paragraph in shape.text_frame.paragraphs:
                    for run in paragraph.runs:
                        fragments.append(run.text)

    matched_paragraphs = []
    matched_keywords = []
    for raw_line in "\n".join(fragments + [""]).split('\n'):
        line = re.sub(r'\s+', '', raw_line)  # remove all whitespace
        hits = [keyword for keyword in keyword_list if keyword in line]
        if hits:
            matched_keywords.append(hits)
            matched_paragraphs.append(line)

    return {
        'file_name': file_path,
        'find_list': matched_paragraphs,
        'paragraph_keyword': matched_keywords,
    }


def pdf_find(file_path: str, keyword_list: list) -> dict:
    """Search a .pdf file for sensitive words.

    Scope: the text of every page. A keyword split across a page break
    cannot be found, since each page is scanned as one unit.

    :param file_path: path of the file to scan
    :param keyword_list: sensitive words, as a list
    :return: {'file_name': ..., 'find_list': [matched paragraphs],
              'paragraph_keyword': [keywords found in each matched paragraph]}
    """
    matched_paragraphs = []
    matched_keywords = []

    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # One whitespace-free string per page.
        pages = [page.extract_text().replace("\n", "").replace(" ", "")
                 for page in reader.pages]

    # Trailing "" mirrors the original newline-terminated accumulation.
    for raw_line in "\n".join(pages + [""]).split('\n'):
        line = re.sub(r'\s+', '', raw_line)
        hits = [keyword for keyword in keyword_list if keyword in line]
        if hits:
            matched_keywords.append(hits)
            matched_paragraphs.append(line)

    return {
        'file_name': file_path,
        'find_list': matched_paragraphs,
        'paragraph_keyword': matched_keywords,
    }


def excel_find(file_path: str, keyword_list: list) -> dict:
    """Search an Excel workbook for sensitive words.

    Scope: every cell of every worksheet. Headers/footers and text boxes
    are not reachable through pandas and are not scanned.

    :param file_path: path of the file to scan
    :param keyword_list: sensitive words, as a list
    :return: {'file_name': ..., 'find_list': ["sheet, row, col, value" hits],
              'paragraph_keyword': [keywords found in each matched cell]}
    """
    matched_cells = []     # "sheet, row, column, value" for each hit
    matched_keywords = []  # keywords found in each matched cell

    workbook = pd.ExcelFile(file_path)

    for sheet_name in workbook.sheet_names:
        frame = pd.read_excel(file_path, sheet_name=sheet_name, header=None)

        # Walk every cell of the sheet; only string cells can match.
        for index, row in frame.iterrows():
            for col_name, cell_value in row.items():
                if not isinstance(cell_value, str):
                    continue
                hits = [keyword for keyword in keyword_list if keyword in cell_value]
                if hits:
                    matched_keywords.append(hits)
                    matched_cells.append(f"{sheet_name}, {index + 1}, {col_name}, {cell_value}")

    return {
        'file_name': file_path,
        'find_list': matched_cells,
        'paragraph_keyword': matched_keywords,
    }


def txt_find(file_path: str, keyword_list: list) -> dict:
    """Search a .txt file for sensitive words.

    Scope: every line of the file.

    :param file_path: path of the file to scan
    :param keyword_list: sensitive words, as a list
    :return: {'file_name': ..., 'find_list': [matched paragraphs],
              'paragraph_keyword': [keywords found in each matched paragraph]}
    """
    matched_paragraphs = []
    matched_keywords = []

    # Fix: read as UTF-8 explicitly. The platform default encoding (e.g.
    # GBK on Chinese Windows) made results machine-dependent and could
    # raise UnicodeDecodeError on UTF-8 files containing CJK text.
    with open(file_path, 'r', encoding='utf-8') as txt_file:
        text_content = txt_file.read()

    for raw_line in text_content.split('\n'):
        line = re.sub(r'\s+', '', raw_line)  # remove all whitespace
        hits = [keyword for keyword in keyword_list if keyword in line]
        if hits:
            matched_keywords.append(hits)
            matched_paragraphs.append(line)

    return {
        'file_name': file_path,
        'find_list': matched_paragraphs,
        'paragraph_keyword': matched_keywords,
    }


def util_keyword_find(file_path: str, keyword_list: list) -> dict:
    """Search a single file for sensitive words, dispatching on extension.

    Supported: Word, PowerPoint, Excel, PDF and plain text. Every matched
    paragraph in the result has its keywords highlighted via
    ``insert_html_tag``.

    :param file_path: path of the file to scan
    :param keyword_list: sensitive words, as a list
    :return: {'file_name': ..., 'find_list': [marked paragraphs],
              'paragraph_keyword': [keywords per matched paragraph]};
             unsupported extensions yield an empty result with the same keys
    """
    # Extension -> finder dispatch table replaces five copy-pasted branches.
    finders = {
        ('.docx', '.doc', '.dot'): docx_find,
        ('.pptx', '.ppt', '.pot'): pptx_find,
        ('.xlsx', '.xls', '.xlsm', '.xlsb', '.xltm', '.xltx'): excel_find,
        ('.pdf',): pdf_find,
        ('.txt',): txt_find,
    }

    for extensions, finder in finders.items():
        if file_path.endswith(extensions):
            find_dict = finder(file_path, keyword_list)
            # Fix: mark by position instead of list.index(), which was
            # O(n) per paragraph and fragile when paragraphs repeat.
            find_dict['find_list'] = [insert_html_tag(para, keyword_list)
                                      for para in find_dict['find_list']]
            return find_dict

    # Fix: include 'paragraph_keyword' so every path returns the same keys.
    return {'file_name': file_path, 'find_list': [], 'paragraph_keyword': []}


if __name__ == '__main__':
    # Ad-hoc manual check: scan a sample workbook for three sensitive words.
    # print(util_keyword_find('浅水海底电缆打捞大作业.docx', ['打捞', '浅水']))
    print(util_keyword_find('浅水.xlsx', ['打捞', '声呐', '浅水']))