Compare commits

...

No commits in common. "main" and "master" have entirely different histories.
main ... master

29 changed files with 12909 additions and 0 deletions

View File

@ -0,0 +1,3 @@
# Load the Celery app together with the project package (the layout Celery's
# Django guide recommends so that @shared_task binds to this app).
from .celery import app as celery_app
# The Celery app is the only name this package exports.
__all__ = ('celery_app',)

16
KeywordRetriever/asgi.py Normal file
View File

@ -0,0 +1,16 @@
"""
ASGI config for KeywordRetriever project.
It exposes the ASGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
"""
import os
from django.core.asgi import get_asgi_application
# Use the project settings unless DJANGO_SETTINGS_MODULE is already set in the environment.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'KeywordRetriever.settings')
# Module-level ASGI callable.
application = get_asgi_application()

View File

@ -0,0 +1,20 @@
import os
from celery import Celery
# Set the default Django settings module.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'KeywordRetriever.settings')
# Create the Celery application instance.
app = Celery('KeywordRetriever')
# namespace='CELERY' lets Celery be configured from the Django settings file;
# every Celery option there must start with CELERY to avoid name clashes.
app.config_from_object('django.conf:settings', namespace='CELERY')
# Discover tasks automatically from the registered Django apps.
app.autodiscover_tasks()
# # A test task
# @app.task(bind=True)
# def debug_task(self):
# print(f'Request: {self.request!r}')

View File

@ -0,0 +1,163 @@
"""
Django settings for KeywordRetriever project.
Generated by 'django-admin startproject' using Django 4.2.4.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/4.2/ref/settings/
"""
from pathlib import Path
import os
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
# NOTE(review): the key is hard-coded and committed; load it from the
# environment before any deployment.
SECRET_KEY = 'django-insecure-&oj4h^8820e14zb_#p(-k8_pcd96y!hng&mre)*1i)owkk1+&d'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
# Empty list: must be filled in once DEBUG is turned off.
ALLOWED_HOSTS = []
# Application definition
# Django contrib apps, then django_unicorn and django_celery_results,
# then the project's own 'retriever' app.
INSTALLED_APPS = [
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'django_unicorn',
'django_celery_results',
'retriever',
]
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
ROOT_URLCONF = 'KeywordRetriever.urls'
# Templates are looked up in the project-level 'templates' directory and,
# because APP_DIRS is True, inside each installed app.
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [BASE_DIR / 'templates']
,
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'KeywordRetriever.wsgi.application'
# Database
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
# Single SQLite database stored next to the project package.
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
}
}
# Password validation
# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
},
]
# Internationalization
# https://docs.djangoproject.com/en/4.2/topics/i18n/
# Simplified Chinese UI; timezone-aware datetimes in the Asia/Shanghai zone.
LANGUAGE_CODE = 'zh-Hans'
TIME_ZONE = 'Asia/Shanghai'
USE_I18N = True
USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/4.2/howto/static-files/
STATIC_URL = 'static/'
STATICFILES_DIRS = [
BASE_DIR / 'static',
]
# NOTE(review): STATIC_ROOT is a subdirectory of the BASE_DIR / 'static'
# entry listed in STATICFILES_DIRS above; confirm collectstatic behaves as
# intended with this layout.
STATIC_ROOT = os.path.join(BASE_DIR, 'static', 'static_root')
# NOTE(review): no MEDIA_ROOT / MEDIA_URL is defined in the visible settings,
# although the retriever models store uploads with upload_to='uploads/';
# confirm where uploaded files are meant to live.
# Default primary key field type
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
# Celery settings. They are read by the Celery app through
# config_from_object('django.conf:settings', namespace='CELERY'), which is why
# every name starts with CELERY_.
CELERY_BROKER_URL = "redis://127.0.0.1:6379/0"
# Keep Celery's timezone identical to Django's TIME_ZONE to avoid time offsets
# (Django itself needs both USE_TZ = True and TIME_ZONE set).
CELERY_TIMEZONE = TIME_ZONE
# Store task state and results through django_celery_results.
# "django-db" uses the database; "django-cache" is the cache-backed alternative.
CELERY_RESULT_BACKEND = "django-db"
# Message and result payloads are JSON only.
CELERY_ACCEPT_CONTENT = ['application/json', ]
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
# Hard per-task time limit in seconds; a task exceeding it is terminated.
# The previous value of 5 was far too short for parsing uploaded documents
# and terminated retriever jobs before they could finish.
CELERY_TASK_TIME_LIMIT = 60 * 15
# Lifetime of stored results in seconds (cleaned up daily when beat runs;
# 0 means results never expire). The previous value of 1 meant one second,
# not the one day that the original comment described.
CELERY_RESULT_EXPIRES = 60 * 60 * 24
# Per-task rate limiting.
# NOTE(review): no task named 'tasks.add' is visible in this project, so this
# annotation presumably has no effect; confirm or remove.
CELERY_TASK_ANNOTATIONS = {'tasks.add': {'rate_limit': '10/s'}}
# Number of concurrent worker processes (Celery defaults to the CPU count).
CELERY_WORKER_CONCURRENCY = 2
# Recycle a worker process after it has executed this many tasks
# (unlimited by default).
CELERY_WORKER_MAX_TASKS_PER_CHILD = 200

28
KeywordRetriever/urls.py Normal file
View File

@ -0,0 +1,28 @@
"""
URL configuration for KeywordRetriever project.
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/4.2/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path, include
import django_unicorn
# Project routes: the Django admin, django-unicorn's endpoints, and the
# retriever app mounted at the site root.
urlpatterns = [
path('admin/', admin.site.urls),
path("unicorn/", include("django_unicorn.urls")),
path('', include('retriever.urls')),
]
# Header text shown by the admin site ("sensitive-word maintenance").
admin.site.site_header = '敏感词维护'

16
KeywordRetriever/wsgi.py Normal file
View File

@ -0,0 +1,16 @@
"""
WSGI config for KeywordRetriever project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
# Use the project settings unless DJANGO_SETTINGS_MODULE is already set in the environment.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'KeywordRetriever.settings')
# Module-level WSGI callable (referenced by WSGI_APPLICATION in the settings).
application = get_wsgi_application()

22
manage.py Normal file
View File

@ -0,0 +1,22 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys
def main():
    """Select the default settings module and hand the command line to Django.

    Raises:
        ImportError: Django cannot be imported (not installed, or the
            virtual environment is not active).
    """
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'KeywordRetriever.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        raise ImportError(
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        ) from exc
    else:
        # Only reached when the import succeeded; errors raised by the
        # command itself are not rewrapped.
        execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()

BIN
requirements.txt Normal file

Binary file not shown.

0
retriever/__init__.py Normal file
View File

15
retriever/admin.py Normal file
View File

@ -0,0 +1,15 @@
from django.contrib import admin
from django.contrib.auth.models import Group, User
from django_celery_results.models import TaskResult, GroupResult
# Register your models here.
from retriever.models import RetrieverTask, UploadFile, KeywordParagraph, Keywords
# The keyword dictionary is the only model registered here for editing.
admin.site.register(Keywords)
# hide the user and group models
admin.site.unregister(Group)
admin.site.unregister(User)
# Also hide the django_celery_results result models.
admin.site.unregister(TaskResult)
admin.site.unregister(GroupResult)

6
retriever/apps.py Normal file
View File

@ -0,0 +1,6 @@
from django.apps import AppConfig
class RetrieverConfig(AppConfig):
    """App configuration for the ``retriever`` application."""

    name = 'retriever'
    # Primary-key type for models that do not declare their own.
    default_auto_field = 'django.db.models.BigAutoField'

View File

View File

@ -0,0 +1,20 @@
from django.utils.timezone import now
from django_unicorn.components import UnicornView
from retriever.models import RetrieverTask
class ParagraphRenderView(UnicornView):
    """Unicorn component that reloads a retriever task so it can be polled."""

    # Most recently loaded task; stays None until get_paragraphs() has run.
    current_task = None

    def mount(self):
        """Remember the task UUID passed as the first positional component argument."""
        self.arg = self.component_args[0]

    def get_paragraphs(self):
        """Reload the task identified by ``self.arg`` and return it.

        Returns:
            The freshly loaded ``RetrieverTask``, also stored on
            ``self.current_task``.

        Raises:
            RetrieverTask.DoesNotExist: no task with that UUID exists.
        """
        # Query once and reuse the instance; the previous version fetched the
        # same row twice on every poll and printed debug output.
        self.current_task = RetrieverTask.objects.get(task_uuid=self.arg)
        return self.current_task

15
retriever/forms.py Normal file
View File

@ -0,0 +1,15 @@
from django import forms
from multiupload.fields import MultiFileField
from retriever.models import Keywords
class SpaceSeparatedField(forms.CharField):
    """Char field whose Python value is the list of space-separated tokens."""

    def to_python(self, value):
        """Split ``value`` on spaces.

        Returns:
            A list of non-empty tokens; ``[]`` for empty or missing input.
        """
        if not value:
            return []
        # split(' ') yields '' for leading, trailing or repeated spaces;
        # those empty tokens are dropped so callers only see real words.
        return [token for token in value.split(' ') if token]
class UploadForm(forms.Form):
    """Upload form accepting between 1 and 10 files of at most 64 MiB each."""

    attachments = MultiFileField(
        min_num=1,
        max_num=10,
        max_file_size=64 * 1024 * 1024,
        attrs={
            'class': 'file-input is-primary',
            # Browser-side filter only; it restricts what the file picker offers.
            'accept': '.docx, .doc, .dot, .pptx, .ppt, .pdf, .xls, .xlsx, .txt',
            # The index page script looks this element up by id.
            'id': 'file-input',
        },
    )

View File

@ -0,0 +1,61 @@
# Generated by Django 4.2.4 on 2023-08-25 14:14
import django.core.validators
from django.db import migrations, models
import django.db.models.deletion
import uuid
class Migration(migrations.Migration):
    """Initial schema: Keywords, RetrieverTask, UploadFile and KeywordParagraph."""

    initial = True

    dependencies = [
    ]

    operations = [
        migrations.CreateModel(
            name='Keywords',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('keyword', models.CharField(max_length=64, unique=True, verbose_name='敏感词')),
                ('is_active', models.BooleanField(default=True, verbose_name='是否启用')),
                ('keyword_created', models.DateTimeField(auto_now_add=True)),
            ],
            options={
                'verbose_name': '敏感词',
                'verbose_name_plural': '敏感词',
            },
        ),
        migrations.CreateModel(
            name='RetrieverTask',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('task_uuid', models.UUIDField(default=uuid.uuid4, editable=False, unique=True)),
                ('task_keywords', models.CharField(max_length=1024)),
                ('task_status', models.BooleanField(default=False)),
                ('task_started', models.BooleanField(default=False)),
                ('task_created', models.DateTimeField(auto_now_add=True)),
            ],
        ),
        migrations.CreateModel(
            name='UploadFile',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('file_id', models.UUIDField(default=uuid.uuid4, editable=False, unique=True)),
                ('file_name', models.CharField(max_length=100)),
                ('is_checked', models.BooleanField(default=False)),
                ('file', models.FileField(upload_to='uploads/', validators=[django.core.validators.FileExtensionValidator(allowed_extensions=['docx'])])),
                ('related_task', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='attachment', to='retriever.retrievertask')),
            ],
        ),
        migrations.CreateModel(
            name='KeywordParagraph',
            fields=[
                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('keyword', models.CharField(max_length=1024)),
                ('paragraph', models.TextField()),
                ('related_file', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='keyword_paragraph', to='retriever.uploadfile')),
            ],
        ),
    ]

View File

70
retriever/models.py Normal file
View File

@ -0,0 +1,70 @@
import uuid
from django.core.validators import FileExtensionValidator
from django.db import models
import time
# Create your models here.
class RetrieverTask(models.Model):
    """One keyword-search job covering a batch of uploaded files."""

    task_uuid = models.UUIDField(default=uuid.uuid4, unique=True, editable=False)
    task_keywords = models.CharField(max_length=1024, blank=False, null=False)
    task_status = models.BooleanField(default=False)
    # Whether the job has already been started; prevents running it twice.
    task_started = models.BooleanField(default=False)
    task_created = models.DateTimeField(auto_now_add=True)

    def __str__(self):
        """Return the task UUID as text."""
        return str(self.task_uuid)
class UploadFile(models.Model):
    """A file uploaded for a retriever task, with its search state."""

    file_id = models.UUIDField(default=uuid.uuid4, unique=True, editable=False)
    related_task = models.ForeignKey('RetrieverTask', related_name='attachment', on_delete=models.CASCADE)
    file_name = models.CharField(max_length=100)
    is_checked = models.BooleanField(default=False)
    # NOTE(review): the validator only allows 'docx' although the upload form
    # offers more file types; presumably it is never enforced because rows
    # are created without full_clean(). Confirm the intended policy.
    file = models.FileField(upload_to='uploads/', validators=[FileExtensionValidator(allowed_extensions=['docx'])])

    def __str__(self):
        """Return the original file name."""
        return self.file_name

    @property
    def file_path(self):
        """Filesystem path of the stored file."""
        return self.file.path

    @property
    def file_keyword_str(self):
        """Return the distinct keywords found in this file as ``', '``-joined text.

        Each related paragraph stores its keywords as the text form of a list
        (for example ``"['a', 'b']"``), so every stored value is split on
        commas and stripped of brackets, quotes and spaces. The result is an
        empty string when nothing was found.
        """
        stored_values = self.keyword_paragraph.all().values_list('keyword', flat=True)
        # dict.fromkeys removes duplicates while keeping first-seen order. The
        # previous ''.join(set(...)) ran keywords together without a separator
        # and in an unstable order, contradicting its own comment.
        unique_keywords = dict.fromkeys(
            item.strip("[]' ") for stored in stored_values for item in stored.split(',')
        )
        unique_keywords.pop('', None)
        return ', '.join(unique_keywords)
class KeywordParagraph(models.Model):
    """A paragraph of an uploaded file together with the keywords found in it."""

    related_file = models.ForeignKey('UploadFile', related_name='keyword_paragraph', on_delete=models.CASCADE)
    keyword = models.CharField(max_length=1024)
    paragraph = models.TextField()

    def __str__(self):
        """Return the stored keyword text."""
        return self.keyword
class Keywords(models.Model):
    """An entry of the sensitive-word dictionary maintained through the admin."""

    keyword = models.CharField(max_length=64, unique=True, blank=False, null=False, verbose_name='敏感词')
    is_active = models.BooleanField(default=True, verbose_name='是否启用')
    keyword_created = models.DateTimeField(auto_now_add=True)

    class Meta:
        verbose_name = '敏感词'
        verbose_name_plural = '敏感词'

    def __str__(self):
        """Return the keyword itself."""
        return self.keyword

    @property
    def active_keyword_list(self):
        """Keywords of every row with ``is_active=True``, not only this instance."""
        active_rows = Keywords.objects.filter(is_active=True)
        return active_rows.values_list('keyword', flat=True)

39
retriever/tasks.py Normal file
View File

@ -0,0 +1,39 @@
# app/tasks.py, 可以复用的task 改这个文件记得重启 Celery
import ast
import time
from celery import shared_task
from .models import RetrieverTask, UploadFile, KeywordParagraph
from .tools.keyword_find import util_keyword_find
# How long finished results stay available before the task and its files are removed.
RESULT_RETENTION_SECONDS = 600


@shared_task
def cleanup_retriever_job(task_id):
    """Delete a retriever task, its rows and its uploaded files.

    Args:
        task_id: UUID (or its string form) of the ``RetrieverTask``.

    Returns:
        ``task_id`` unchanged.
    """
    for attachment in UploadFile.objects.filter(related_task__task_uuid=task_id):
        # save=False: the row itself is removed by the cascade delete below.
        attachment.file.delete(save=False)
    RetrieverTask.objects.filter(task_uuid=task_id).delete()
    return task_id


@shared_task
def start_retriever_job(task_id):
    """Search every unchecked file of a task for the task's keywords.

    Matching paragraphs are stored as ``KeywordParagraph`` rows, and the task
    is then marked finished. Removal of the task and its files is scheduled
    as a separate delayed task; the previous version slept for 600 seconds
    inside this task, which kept a worker slot blocked for that whole time.

    Args:
        task_id: UUID (or its string form) of the ``RetrieverTask``.

    Returns:
        ``task_id`` unchanged.

    Raises:
        RetrieverTask.DoesNotExist: the task no longer exists.
    """
    import logging

    logger = logging.getLogger(__name__)
    current_task = RetrieverTask.objects.get(task_uuid=task_id)
    # task_keywords holds the text form of a Python list; literal_eval parses
    # literals only and never executes code.
    task_keywords = ast.literal_eval(current_task.task_keywords)
    for each in current_task.attachment.filter(is_checked=False):
        try:
            result_dict = util_keyword_find(each.file_path, task_keywords)
        except Exception:
            # Best effort: one unreadable file must not abort the whole batch.
            # It gets a placeholder result and stays unchecked.
            logger.exception("Keyword search failed for %s", each.file_name)
            result_dict = {'file_name': each.file_name, 'find_list': ['该文件检查程序出错,请联系管理员'], 'paragraph_keyword': ['出错']}
        else:
            UploadFile.objects.filter(file_id=each.file_id).update(is_checked=True)
        # .get(): tolerate results that carry no 'paragraph_keyword' entry
        # (for example unsupported file types) instead of raising KeyError.
        KeywordParagraph.objects.bulk_create(
            [KeywordParagraph(related_file=each, keyword=para_keyword, paragraph=paragraph)
             for paragraph, para_keyword in zip(result_dict['find_list'], result_dict.get('paragraph_keyword', []))])
    RetrieverTask.objects.filter(task_uuid=task_id).update(task_status=True)
    # Keep the results viewable for a while, then clean up without blocking a worker.
    cleanup_retriever_job.apply_async((str(task_id),), countdown=RESULT_RETENTION_SECONDS)
    return task_id

View File

@ -0,0 +1,76 @@
{% extends 'base.html' %}
{% load unicorn %}
{% block navbar_item %}
<a href="/" class="button is-primary">
返回首页
</a>
{% endblock %}
{% block content %}
<div class="container">
<div class="content">
{# <h5 class="title is-6">查找关键词: <span class="tag is-danger is-light is-medium">{{ current_task.task_keywords }}</span></h5>#}
<div>
{% if not current_task.task_status %}
<b>搜索状态: <span class="tag is-danger is-light is-medium">正在检索</span></b>
<progress class="progress is-small is-primary" max="100">15%</progress>
{% endif %}
{% if current_task.task_status %}
<b>搜索状态: <span class="tag is-success is-light is-medium">已完成</span></b>
{% endif %}
<br>
</div>
{% for task_file in current_task.attachment.all %}
<div class="block">
<span class="tag is-info is-medium">
{{ task_file.file_name }}
</span>
{% if task_file.file_keyword_str %}
<b>检索到的敏感词:{{ task_file.file_keyword_str }}</b>
<br>
{% for paragraph in task_file.keyword_paragraph.all %}
<p><b>{{ forloop.counter }}.</b>{{ paragraph.paragraph | safe }}</p>
{% endfor %}
{% endif %}
{% if not task_file.file_keyword_str %}
<b>未检索到敏感词</b>
{% endif %}
</div>
{% endfor %}
</div>
</div>
{% endblock %}
{#{% block content %}#}
{# <div>#}
{# {% unicorn 'paragraph_render' current_task.task_uuid %}#}
{# </div>#}
{##}
{#{% endblock %}#}
{% block script %}
function monitorAndRefresh() {
// Task status as rendered into the page by the Django template.
const targetElement = "{{ current_task.task_status }}"; // a string such as "True", not a DOM element
if (targetElement !== "True") {
// The task has not finished yet: reload the page after one second.
setTimeout(function () {
location.reload();
}, 1000); // 1000 ms = one second
}
}
// Start monitoring as soon as the script runs.
monitorAndRefresh();
{% endblock %}

View File

@ -0,0 +1,6 @@
<div>
<!-- put component code here -->
<div unicorn:poll-1000="get_paragraphs">{{ current_task.task_status }}</div>
</div>

3
retriever/tests.py Normal file
View File

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

View File

@ -0,0 +1,222 @@
import docx2txt
from pptx import Presentation
import re
import PyPDF2
import pandas as pd
def insert_html_tag(paragraph: str, keyword_list: list) -> str:
    """Return ``paragraph`` as HTML with every keyword occurrence highlighted.

    The paragraph comes from an uploaded document and the result is rendered
    without auto-escaping, so all document text is HTML-escaped here. Only the
    highlighting ``<span>`` tags are emitted as markup.

    :param paragraph: paragraph text to process
    :param keyword_list: list of sensitive words; empty strings are ignored
    :return: escaped paragraph with each match wrapped in a yellow ``<span>``
    """
    import html  # local import: this block must not rely on a new file-level import

    # Match in a single pass so a keyword can never hit markup inserted for
    # another keyword (sequential str.replace corrupted the tags for keywords
    # such as "span"). Longest keywords go first so overlapping keywords
    # highlight the longest match.
    keywords = sorted({keyword for keyword in keyword_list if keyword}, key=len, reverse=True)
    if not keywords:
        return html.escape(paragraph)
    pattern = re.compile('|'.join(re.escape(keyword) for keyword in keywords))
    pieces = []
    cursor = 0
    for match in pattern.finditer(paragraph):
        pieces.append(html.escape(paragraph[cursor:match.start()]))
        pieces.append('<span style="background-color:yellow">' + html.escape(match.group()) + '</span>')
        cursor = match.end()
    pieces.append(html.escape(paragraph[cursor:]))
    return ''.join(pieces)
def docx_find(file_path: str, keyword_list: list) -> dict:
    """Search the text that docx2txt extracts from a Word file.

    The original author's note states that the extracted text covers all
    content: paragraphs, headers, footers, tables and text boxes.

    :param file_path: path of the file
    :param keyword_list: list of sensitive words
    :return: dict with 'file_name', 'find_list' (matching paragraphs with all
        whitespace removed) and 'paragraph_keyword' (keywords found in each
        matching paragraph, in the same order)
    """
    extracted_text = docx2txt.process(file_path)
    matched_paragraphs = []
    matched_keywords = []
    for raw_line in extracted_text.split('\n'):
        compact = re.sub(r'\s+', '', raw_line)  # drop every whitespace character
        hits = [keyword for keyword in keyword_list if keyword in compact]
        if not hits:
            continue
        matched_keywords.append(hits)
        matched_paragraphs.append(compact)
    return {
        'file_name': file_path,
        'find_list': matched_paragraphs,
        'paragraph_keyword': matched_keywords,
    }
def pptx_find(file_path: str, keyword_list: list) -> dict:
    """Search the text frames of every slide in a PowerPoint file.

    Only shapes that report ``has_text_frame`` are inspected.

    :param file_path: path of the file
    :param keyword_list: list of sensitive words
    :return: dict with 'file_name', 'find_list' (matching paragraphs with all
        whitespace removed) and 'paragraph_keyword' (keywords found in each
        matching paragraph, in the same order)
    """
    prs = Presentation(file_path)
    find_list = []
    paragraph_keyword = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                # Join the runs before matching: runs are formatting fragments
                # of one paragraph, and testing them one by one missed any
                # keyword that spans two runs.
                para = re.sub(r'\s+', '', ''.join(run.text for run in paragraph.runs))
                this_para_keyword = [keyword for keyword in keyword_list if keyword in para]
                if this_para_keyword:
                    paragraph_keyword.append(this_para_keyword)
                    find_list.append(para)
    return {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
def pdf_find(file_path: str, keyword_list: list) -> dict:
    """Search a PDF file page by page.

    Each page is matched as one unit with all whitespace removed, so a keyword
    split across a page break is not found.

    :param file_path: path of the file
    :param keyword_list: list of sensitive words
    :return: dict with 'file_name', 'find_list' (text of each matching page)
        and 'paragraph_keyword' (keywords found on each matching page, in the
        same order)
    """
    find_list = []
    paragraph_keyword = []
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Extract while the file is still open. `or ''` guards against a page
        # that yields no text.
        page_texts = [page.extract_text() or '' for page in pdf_reader.pages]
    for page_text in page_texts:
        para = re.sub(r'\s+', '', page_text)
        this_para_keyword = [keyword for keyword in keyword_list if keyword in para]
        if this_para_keyword:
            paragraph_keyword.append(this_para_keyword)
            find_list.append(para)
    return {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
def excel_find(file_path: str, keyword_list: list) -> dict:
    """Search every string cell of every worksheet in an Excel file.

    Per the original author's note, headers, footers and text boxes are not
    searched.

    :param file_path: path of the file
    :param keyword_list: list of sensitive words
    :return: dict with 'file_name', 'find_list' (one
        "sheet, row, column, value" entry per matching cell) and
        'paragraph_keyword' (keywords found in each matching cell, in the
        same order)
    """
    find_list = []  # "sheet, row, column, value" text for each matching cell
    paragraph_keyword = []  # keywords found in each matching cell
    # Open the workbook once and close it afterwards; the previous version
    # never closed the ExcelFile and reopened the file for every sheet.
    with pd.ExcelFile(file_path) as xls:
        for sheet_name in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet_name, header=None)
            for index, row in df.iterrows():
                for col_name, cell_value in row.items():
                    if not isinstance(cell_value, str):
                        continue
                    this_para_keyword = [keyword for keyword in keyword_list if keyword in cell_value]
                    if this_para_keyword:
                        paragraph_keyword.append(this_para_keyword)
                        find_list.append(f"{sheet_name}, {index + 1}, {col_name}, {cell_value}")
    return {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
def txt_find(file_path: str, keyword_list: list) -> dict:
    """Search a UTF-8 text file line by line.

    :param file_path: path of the file
    :param keyword_list: list of sensitive words
    :return: dict with 'file_name', 'find_list' (matching lines with all
        whitespace removed) and 'paragraph_keyword' (keywords found in each
        matching line, in the same order)
    :raises UnicodeDecodeError: the file is not valid UTF-8
    """
    find_list = []
    paragraph_keyword = []
    # The upload page promises UTF-8 support, so decode explicitly instead of
    # using the platform default. 'utf-8-sig' also drops a leading BOM so it
    # does not leak into the first paragraph.
    with open(file_path, 'r', encoding='utf-8-sig') as txt_file:
        for line in txt_file:
            para = re.sub(r'\s+', '', line)
            this_para_keyword = [keyword for keyword in keyword_list if keyword in para]
            if this_para_keyword:
                paragraph_keyword.append(this_para_keyword)
                find_list.append(para)
    return {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
def util_keyword_find(file_path: str, keyword_list: list) -> dict:
    """Search one file for sensitive words and highlight them.

    The file type is chosen from the extension, compared case-insensitively.
    Unsupported extensions produce an empty result.

    :param file_path: path of the file
    :param keyword_list: list of sensitive words
    :return: dict with 'file_name', 'find_list' (matching paragraphs with the
        keywords wrapped in highlight tags) and 'paragraph_keyword' (keywords
        found in each matching paragraph, in the same order)
    """
    lowered = file_path.lower()
    if lowered.endswith(('.docx', '.doc', '.dot')):
        finder = docx_find
    elif lowered.endswith(('.pptx', '.ppt', '.pot')):
        finder = pptx_find
    elif lowered.endswith(('.xlsx', '.xls', '.xlsm', '.xlsb', '.xltm', '.xltx')):
        finder = excel_find
    elif lowered.endswith('.pdf'):
        finder = pdf_find
    elif lowered.endswith('.txt'):
        finder = txt_find
    else:
        # Same keys as a real result, so callers that read
        # 'paragraph_keyword' do not fail on unsupported files.
        return {'file_name': file_path, 'find_list': [], 'paragraph_keyword': []}
    find_dict = finder(file_path, keyword_list)
    # Build a new list rather than replacing items through list.index() while
    # iterating, which was quadratic and fragile with duplicate paragraphs.
    find_dict['find_list'] = [insert_html_tag(para, keyword_list) for para in find_dict['find_list']]
    return find_dict
if __name__ == '__main__':
    # Manual smoke test against a local sample workbook.
    demo_keywords = ['打捞', '声呐', '浅水']
    print(util_keyword_find('浅水.xlsx', demo_keywords))

7
retriever/urls.py Normal file
View File

@ -0,0 +1,7 @@
from django.urls import path, include
from . import views
# App routes: the upload page at the root, and one results page per task,
# addressed by the task's UUID.
urlpatterns = [
path('', views.KeywordRetrieverView.as_view(), name='index'),
path('<uuid:task_uuid>', views.TaskViewerView.as_view(), name='task_viewer')
]

47
retriever/views.py Normal file
View File

@ -0,0 +1,47 @@
from django.shortcuts import render
from django.urls import reverse_lazy
from django.views import View
from django.views.generic import FormView
from retriever.forms import UploadForm
from retriever.models import RetrieverTask, UploadFile, KeywordParagraph, Keywords
from .tasks import start_retriever_job
# Create your views here.
class KeywordRetrieverView(FormView):
    """Upload page: creates a retriever task from the submitted files."""

    form_class = UploadForm
    template_name = "index.html"  # Replace with your template.

    def form_valid(self, form):
        """Create the task, attach the uploaded files, then redirect to its page."""
        # Snapshot the currently active keywords for this task.
        active_keywords = Keywords.objects.filter(is_active=True)
        keyword_list = list(active_keywords.values_list('keyword', flat=True))
        r_task = RetrieverTask.objects.create(task_keywords=keyword_list)
        self.task_uuid = r_task.task_uuid
        # Store every uploaded file as a row linked to the new task.
        for uploaded in form.cleaned_data['attachments']:
            UploadFile.objects.create(related_task=r_task, file_name=uploaded.name, file=uploaded)
        return super().form_valid(form)

    def get_success_url(self):
        """Redirect to the viewer page of the task created in form_valid()."""
        return reverse_lazy('task_viewer', kwargs={'task_uuid': self.task_uuid})
class TaskViewerView(View):
    """Results page of one retriever task; starts the job on first visit."""

    def get(self, requests, task_uuid):
        """Render the task page and dispatch the search job exactly once.

        Args:
            requests: the incoming HTTP request.
            task_uuid: UUID captured from the URL.

        Returns:
            The rendered 'task_viewer.html' response. An unknown or already
            removed task produces a 404 instead of a server error.
        """
        from django.shortcuts import get_object_or_404

        current_task = get_object_or_404(RetrieverTask, task_uuid=task_uuid)
        # Claim the task with one conditional UPDATE. Only the request that
        # flips task_started from False to True dispatches the job, so
        # concurrent page loads cannot start it twice.
        claimed = RetrieverTask.objects.filter(pk=current_task.pk, task_started=False).update(task_started=True)
        if claimed:
            current_task.task_started = True
            start_retriever_job.delay(current_task.task_uuid)  # run asynchronously
        return render(requests, 'task_viewer.html', {'current_task': current_task})

11851
static/bulma.css vendored Normal file

File diff suppressed because it is too large Load Diff

1
static/bulma.css.map Normal file

File diff suppressed because one or more lines are too long

1
static/bulma.min.css vendored Normal file

File diff suppressed because one or more lines are too long

104
templates/base.html Normal file
View File

@ -0,0 +1,104 @@
{% load static %}
{% load unicorn %}
<!DOCTYPE html>
<html lang="zh-Hans">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0, shrink-to-fit=no">
<meta name="author" content="Aldi Duzha">
<meta name="description" content="敏感词检测工具 (Sensitive Word Detection Tool for Documents)">
<meta name="keywords" content="bulma, rent, template, apartments, page, website, free, awesome">
<title>寻章智搜</title>
<link rel="stylesheet" href="{% static 'fontawesomecss/css/all.min.css' %}">
<link rel="stylesheet" type="text/css" href="{% static 'bulma.min.css' %}">
{% unicorn_scripts %}
</head>
<body>
{% csrf_token %}
<div id="app">
<section class="hero is-fullheight is-light" >
<div class="hero-head">
<nav class="navbar is-transparent is-spaced" role="navigation" aria-label="main navigation">
<div class="container">
<div class="navbar-brand">
<a class="navbar-item" href="/">
{# <img src="https://bulma.io/images/bulma-logo.png" alt="Bulma Rent" width="80" height="20">#}
<span class="is-size-3 has-text-weight-semibold">寻章智搜 &bullet; </span>
<span class="is-size-3 has-text-weight-light" style="color: gray">直升机所保密办</span>
</a>
<a role="button" class="navbar-burger burger" aria-label="menu" aria-expanded="false" data-target="navbarTopMain">
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
</a>
</div>
<div class="navbar-menu" id="navbarTopMain">
<div class="navbar-end">
<div class="navbar-item">
{% block navbar_item %}
<a href="/admin" class="button is-primary">
敏感词库维护
</a>
{% endblock %}
</div>
</div>
</div>
</div>
</nav>
</div>
{% block content %}
{% endblock %}
</section>
<footer class="hero is-small is-light">
<div class="hero-body">
<div class="container has-text-centered">
<a href="https://raiot.me">
Made with <span class="has-text-danger"></span> by Raiot
</a>
<div class="columns m-t-10">
<div class="column">
<nav class="has-text-grey-light">
<a href="#" class="has-text-primary">About</a>
{# &bullet;#}
</nav>
</div>
</div>
<div class="b-t m-t-30 p-t-30 has-text-grey-light is-size-7">
Rent Template 2020 &copy; Aldi Duzha <br>
</div>
</div>
</div>
</footer>
</div>
<script>
document.addEventListener('DOMContentLoaded', function () {
    // Wire every "navbar-burger" button to the menu named in its data-target attribute.
    const burgers = Array.from(document.querySelectorAll('.navbar-burger'));
    for (const burger of burgers) {
        burger.addEventListener('click', function () {
            const menu = document.getElementById(burger.dataset.target);
            // Toggle "is-active" on both the burger and its menu.
            burger.classList.toggle('is-active');
            menu.classList.toggle('is-active');
        });
    }
});
{% block script %}
{% endblock %}
</script>
</body>
</html>

97
templates/index.html Normal file
View File

@ -0,0 +1,97 @@
{% extends 'base.html' %}
{% load static %}
{% load unicorn %}
{% block navbar_item %}
<a href="/admin" class="button is-primary">
敏感词库维护
</a>
{% endblock %}
{% block content %}
<div class="hero-body p-b-30 ">
<div class="container">
<h2 class="subtitle">
<span class="has-text-centered is-block">
Sensitive Word Detection Tool for Documents
</span>
</h2>
<h1 class="title">
<span class="is-size-2 has-text-centered is-block">敏感词检测工具</span>
</h1>
<div class="container">
<div class="notification">
<span class="has-text-centered is-block">可批量上传,支持 .docx, .doc, .dot, .pptx, .ppt, .pdf, .xls, .xlsx, .txt(UTF-8) 格式文件</span>
<br>
<form method="POST" enctype="multipart/form-data">
{% csrf_token %}
<div class="columns">
<div class="column has-text-centered">
<div class="file is-boxed is-centered">
<label class="file-label">
{{ form.attachments }}
<span class="file-cta">
<span class="file-icon">
<i class="fas fa-upload"></i>
</span>
<span class="file-label">
选择文件
</span>
</span>
</label>
</div>
</div>
</div>
<div class="columns">
<div class="column"></div>
<div class="is-centered column" id="file-list">
</div>
<div class="column"></div>
</div>
<br>
<div class="is-centered" style="text-align:center">
<input type="submit" value="提交" class="button is-link is-medium is-center">
</div>
</form>
</div>
</div>
</div>
</div>
{% endblock %}
{% block script %}
{# show file name after file selected#}
// File input and the container that lists the selected file names.
const fileInput = document.getElementById('file-input');
const fileListDiv = document.getElementById('file-list');
// Re-render the list whenever the selection changes.
fileInput.addEventListener('change', function () {
    // Build the list from text nodes instead of innerHTML, so a file name
    // containing markup is shown literally and never parsed as HTML.
    fileListDiv.textContent = '';
    fileListDiv.append('所选文件:', document.createElement('br'));
    for (const file of fileInput.files) {
        fileListDiv.append(file.name, document.createElement('br'));
    }
});
{% endblock %}