Compare commits

...

No commits in common. "master" and "main" have entirely different histories.
master ... main

29 changed files with 0 additions and 12909 deletions

View File

@ -1,3 +0,0 @@
from .celery import app as celery_app
# Public API of this package: only the Celery app imported above as ``celery_app``.
__all__ = ('celery_app',)

View File

@ -1,16 +0,0 @@
"""
ASGI config for KeywordRetriever project.
It exposes the ASGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
"""
import os
from django.core.asgi import get_asgi_application
# Use the project's settings module unless DJANGO_SETTINGS_MODULE is already set.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'KeywordRetriever.settings')
# Module-level ASGI callable described in the module docstring.
application = get_asgi_application()

View File

@ -1,20 +0,0 @@
import os
from celery import Celery
# Set the default Django settings module (kept if the variable is already set).
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'KeywordRetriever.settings')
# Instantiate the Celery application.
app = Celery('KeywordRetriever')
# namespace='CELERY' allows Celery to be configured from the Django settings file;
# every Celery option there must start with CELERY to avoid name clashes.
app.config_from_object('django.conf:settings', namespace='CELERY')
# Automatically discover tasks in the registered Django apps.
app.autodiscover_tasks()
# # A test task
# @app.task(bind=True)
# def debug_task(self):
# print(f'Request: {self.request!r}')

View File

@ -1,163 +0,0 @@
"""
Django settings for KeywordRetriever project.
Generated by 'django-admin startproject' using Django 4.2.4.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/topics/settings/
For the full list of settings and their values, see
https://docs.djangoproject.com/en/4.2/ref/settings/
"""
from pathlib import Path
import os
# Build paths inside the project like this: BASE_DIR / 'subdir'.
BASE_DIR = Path(__file__).resolve().parent.parent
# Quick-start development settings - unsuitable for production
# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
# SECURITY WARNING: keep the secret key used in production secret!
# NOTE(review): the key is hard-coded here; presumably it should come from the
# environment for any non-development deployment - confirm.
SECRET_KEY = 'django-insecure-&oj4h^8820e14zb_#p(-k8_pcd96y!hng&mre)*1i)owkk1+&d'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
# Empty list: no host names are explicitly allowed in this file.
ALLOWED_HOSTS = []
# Application definition
# Django contrib apps first, then the third-party apps (django_unicorn,
# django_celery_results) and finally the project's own ``retriever`` app.
INSTALLED_APPS = [
'django.contrib.admin',
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'django_unicorn',
'django_celery_results',
'retriever',
]
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'django.middleware.clickjacking.XFrameOptionsMiddleware',
]
ROOT_URLCONF = 'KeywordRetriever.urls'
# Templates are looked up in the project-level ``templates`` directory and,
# because APP_DIRS is True, in each installed app's own templates directory.
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [BASE_DIR / 'templates']
,
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'KeywordRetriever.wsgi.application'
# Database
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases
# A single SQLite database file stored next to the project (BASE_DIR / 'db.sqlite3').
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': BASE_DIR / 'db.sqlite3',
}
}
# Password validation
# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators
AUTH_PASSWORD_VALIDATORS = [
{
'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator',
},
{
'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator',
},
]
# Internationalization
# https://docs.djangoproject.com/en/4.2/topics/i18n/
LANGUAGE_CODE = 'zh-Hans'
TIME_ZONE = 'Asia/Shanghai'
USE_I18N = True
USE_TZ = True
# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/4.2/howto/static-files/
STATIC_URL = 'static/'
STATICFILES_DIRS = [
BASE_DIR / 'static',
]
# NOTE(review): STATIC_ROOT is a sub-directory of the STATICFILES_DIRS entry
# above - confirm that collectstatic behaves as intended with this layout.
STATIC_ROOT = os.path.join(BASE_DIR, 'static', 'static_root')
# NOTE(review): no MEDIA_ROOT / MEDIA_URL is defined in this settings module -
# confirm where uploaded files are expected to be stored.
# Default primary key field type
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
# Broker URL: a Redis instance on 127.0.0.1:6379, database 0.
CELERY_BROKER_URL = "redis://127.0.0.1:6379/0"
# Keep the Celery timezone identical to Django's TIME_ZONE to avoid time offsets.
# On the Django side this requires both USE_TZ = True and TIME_ZONE = 'Asia/Shanghai'.
CELERY_TIMEZONE = TIME_ZONE
# Backend used by django_celery_results to store task results.
# URL-style backends have the form db+scheme://user:password@host:port/dbname;
# the "django-db" (database) and "django-cache" (cache) backends can store task state and results.
CELERY_RESULT_BACKEND = "django-db"
# Message content formats; JSON is the default.
CELERY_ACCEPT_CONTENT = ['application/json', ]
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'
# Task time limit in seconds; a task exceeding it is aborted and the next one runs.
# NOTE(review): 5 seconds looks short for scanning uploaded documents - confirm.
CELERY_TASK_TIME_LIMIT = 5
# Expiry of stored results (the default is one day; with beat running Celery purges them daily).
# A value of 0 keeps results forever.
# NOTE(review): the value 1 is presumably interpreted as one second - confirm it is intended.
CELERY_RESULT_EXPIRES = 1
# Task rate limiting.
# NOTE(review): 'tasks.add' does not match any task name visible in this project - confirm.
CELERY_TASK_ANNOTATIONS = {'tasks.add': {'rate_limit': '10/s'}}
# Worker concurrency; normally defaults to the number of CPU cores and may be left unset.
CELERY_WORKER_CONCURRENCY = 2
# Number of tasks a worker child executes before it is recycled (unlimited by default).
CELERY_WORKER_MAX_TASKS_PER_CHILD = 200

View File

@ -1,28 +0,0 @@
"""
URL configuration for KeywordRetriever project.
The `urlpatterns` list routes URLs to views. For more information please see:
https://docs.djangoproject.com/en/4.2/topics/http/urls/
Examples:
Function views
1. Add an import: from my_app import views
2. Add a URL to urlpatterns: path('', views.home, name='home')
Class-based views
1. Add an import: from other_app.views import Home
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
Including another URLconf
1. Import the include() function: from django.urls import include, path
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
"""
from django.contrib import admin
from django.urls import path, include
import django_unicorn
# Routes: the admin site, the django_unicorn URLs, and everything else
# delegated to the retriever app's URLconf.
urlpatterns = [
path('admin/', admin.site.urls),
path("unicorn/", include("django_unicorn.urls")),
path('', include('retriever.urls')),
]
# Header text of the admin site (user-facing string).
admin.site.site_header = '敏感词维护'

View File

@ -1,16 +0,0 @@
"""
WSGI config for KeywordRetriever project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
# Use the project's settings module unless DJANGO_SETTINGS_MODULE is already set.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'KeywordRetriever.settings')
# Module-level WSGI callable described in the module docstring.
application = get_wsgi_application()

View File

@ -1,22 +0,0 @@
#!/usr/bin/env python
"""Django's command-line utility for administrative tasks."""
import os
import sys
def main():
    """Select the project settings and hand ``sys.argv`` to Django's CLI.

    Raises:
        ImportError: if Django cannot be imported; the original error is
            chained as the cause.
    """
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'KeywordRetriever.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        hint = (
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        )
        raise ImportError(hint) from exc
    else:
        execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()

Binary file not shown.

View File

View File

@ -1,15 +0,0 @@
from django.contrib import admin
from django.contrib.auth.models import Group, User
from django_celery_results.models import TaskResult, GroupResult
# Register your models here.
from retriever.models import RetrieverTask, UploadFile, KeywordParagraph, Keywords
# Make the sensitive-keyword model editable in the admin site.
admin.site.register(Keywords)
# hide the user and group models
admin.site.unregister(Group)
admin.site.unregister(User)
# Also remove the django_celery_results models from the admin site.
admin.site.unregister(TaskResult)
admin.site.unregister(GroupResult)

View File

@ -1,6 +0,0 @@
from django.apps import AppConfig
class RetrieverConfig(AppConfig):
    """Django application configuration for the ``retriever`` app."""

    name = 'retriever'
    default_auto_field = 'django.db.models.BigAutoField'

View File

@ -1,20 +0,0 @@
from django.utils.timezone import now
from django_unicorn.components import UnicornView
from retriever.models import RetrieverTask
class ParagraphRenderView(UnicornView):
    """Unicorn component that re-reads a retriever task on demand.

    The task UUID is taken from the first positional component argument in
    ``mount``; ``get_paragraphs`` reloads the matching ``RetrieverTask``.
    """

    # Most recently loaded task; stays None until get_paragraphs() has run.
    current_task = None

    def mount(self):
        """Store the first component argument (the task UUID) on ``self.arg``."""
        self.arg = self.component_args[0]

    def get_paragraphs(self):
        """Reload the task identified by ``self.arg``, cache it and return it.

        The task is fetched once and the same object is both stored on
        ``current_task`` and returned (previously the identical query was
        issued twice and the results were printed to stdout).

        Any exception raised by ``RetrieverTask.objects.get`` propagates.
        """
        self.current_task = RetrieverTask.objects.get(task_uuid=self.arg)
        return self.current_task

View File

@ -1,15 +0,0 @@
from django import forms
from multiupload.fields import MultiFileField
from retriever.models import Keywords
class SpaceSeparatedField(forms.CharField):
    """Char field whose cleaned value is the list of space-separated tokens."""

    def to_python(self, value):
        """Split ``value`` on single spaces and return the non-empty tokens.

        Returns an empty list for an empty/None value. Empty tokens produced
        by leading, trailing or repeated spaces are dropped, so callers never
        receive ``''`` entries.
        """
        if not value:
            return []
        return [token for token in value.split(' ') if token]
class UploadForm(forms.Form):
    """Upload form with one multi-file field named ``attachments``.

    The field is configured for 1 to 10 files with ``max_file_size`` set to
    64 MiB; ``attrs`` carries the widget's CSS classes, ``accept`` list and
    element id.
    """

    attachments = MultiFileField(
        min_num=1,
        max_num=10,
        max_file_size=1024 * 1024 * 64,
        attrs={
            'class': 'file-input is-primary',
            'accept': '.docx, .doc, .dot, .pptx, .ppt, .pdf, .xls, .xlsx, .txt',
            'id': 'file-input',
        },
    )

View File

@ -1,61 +0,0 @@
# Generated by Django 4.2.4 on 2023-08-25 14:14
import django.core.validators
from django.db import migrations, models
import django.db.models.deletion
import uuid
# Initial schema migration for the retriever app: creates the Keywords,
# RetrieverTask, UploadFile and KeywordParagraph tables.
class Migration(migrations.Migration):
initial = True
# No dependencies on other migrations.
dependencies = [
]
operations = [
# Sensitive keyword entries (unique keyword, active flag, creation time).
migrations.CreateModel(
name='Keywords',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('keyword', models.CharField(max_length=64, unique=True, verbose_name='敏感词')),
('is_active', models.BooleanField(default=True, verbose_name='是否启用')),
('keyword_created', models.DateTimeField(auto_now_add=True)),
],
options={
'verbose_name': '敏感词',
'verbose_name_plural': '敏感词',
},
),
# Retrieval task identified by a unique UUID.
migrations.CreateModel(
name='RetrieverTask',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('task_uuid', models.UUIDField(default=uuid.uuid4, editable=False, unique=True)),
('task_keywords', models.CharField(max_length=1024)),
('task_status', models.BooleanField(default=False)),
('task_started', models.BooleanField(default=False)),
('task_created', models.DateTimeField(auto_now_add=True)),
],
),
# Uploaded file belonging to a task; removed with the task (CASCADE).
migrations.CreateModel(
name='UploadFile',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('file_id', models.UUIDField(default=uuid.uuid4, editable=False, unique=True)),
('file_name', models.CharField(max_length=100)),
('is_checked', models.BooleanField(default=False)),
('file', models.FileField(upload_to='uploads/', validators=[django.core.validators.FileExtensionValidator(allowed_extensions=['docx'])])),
('related_task', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='attachment', to='retriever.retrievertask')),
],
),
# Paragraph of a file together with its keyword text; removed with the file (CASCADE).
migrations.CreateModel(
name='KeywordParagraph',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('keyword', models.CharField(max_length=1024)),
('paragraph', models.TextField()),
('related_file', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='keyword_paragraph', to='retriever.uploadfile')),
],
),
]

View File

@ -1,70 +0,0 @@
import uuid
from django.core.validators import FileExtensionValidator
from django.db import models
import time
# Create your models here.
class RetrieverTask(models.Model):
    """A keyword-retrieval task, addressed externally by ``task_uuid``."""

    task_uuid = models.UUIDField(default=uuid.uuid4, unique=True, editable=False)
    task_keywords = models.CharField(max_length=1024, blank=False, null=False)
    task_status = models.BooleanField(default=False)
    # Whether the task has already been started, to avoid running it twice.
    task_started = models.BooleanField(default=False)
    task_created = models.DateTimeField(auto_now_add=True)

    def __str__(self):
        """Return the task UUID rendered as a string."""
        return str(self.task_uuid)
class UploadFile(models.Model):
    """An uploaded file attached to a ``RetrieverTask``."""

    file_id = models.UUIDField(default=uuid.uuid4, unique=True, editable=False)
    related_task = models.ForeignKey('RetrieverTask', related_name='attachment', on_delete=models.CASCADE)
    file_name = models.CharField(max_length=100)
    is_checked = models.BooleanField(default=False)
    file = models.FileField(upload_to='uploads/', validators=[FileExtensionValidator(allowed_extensions=['docx'])])

    def __str__(self):
        return self.file_name

    @property
    def file_path(self):
        """Filesystem path of the stored file (``self.file.path``)."""
        return self.file.path

    @property
    def file_keyword_str(self):
        """Return the distinct keywords found in this file as ``'a, b, c'``.

        Each related paragraph stores its keywords as text; the value is
        split on commas and the characters ``[``, ``]``, ``'`` and spaces are
        stripped from both ends of every item. Duplicates and empty items are
        dropped and first-seen order is kept, so the result is deterministic.
        Returns ``''`` when no keyword is stored.
        """
        stored_values = self.keyword_paragraph.all().values_list('keyword', flat=True)
        distinct = {}  # dict keys keep insertion order, giving a stable result
        for stored in stored_values:
            for item in stored.split(','):
                cleaned = item.strip("[]' ")
                if cleaned:
                    distinct.setdefault(cleaned, None)
        # Join with a visible separator; joining with '' glued the keywords
        # together into one unreadable word.
        return ', '.join(distinct)
class KeywordParagraph(models.Model):
    """A paragraph of an uploaded file plus the keyword text recorded for it."""

    related_file = models.ForeignKey(
        'UploadFile',
        related_name='keyword_paragraph',
        on_delete=models.CASCADE,
    )
    keyword = models.CharField(max_length=1024)
    paragraph = models.TextField()

    def __str__(self):
        """Return the stored keyword text."""
        return self.keyword
class Keywords(models.Model):
    """A sensitive keyword that can be switched on or off."""

    keyword = models.CharField(max_length=64, unique=True, blank=False, null=False, verbose_name='敏感词')
    is_active = models.BooleanField(default=True, verbose_name='是否启用')
    keyword_created = models.DateTimeField(auto_now_add=True)

    class Meta:
        verbose_name = '敏感词'
        verbose_name_plural = '敏感词'

    def __str__(self):
        """Return the keyword text."""
        return self.keyword

    @property
    def active_keyword_list(self):
        """Return the keyword values of all entries with ``is_active=True``.

        The result is the flat ``values_list`` queryset over the whole table,
        independent of the instance the property is read from.
        """
        enabled_entries = Keywords.objects.filter(is_active=True)
        return enabled_entries.values_list('keyword', flat=True)

View File

@ -1,39 +0,0 @@
# app/tasks.py, 可以复用的task 改这个文件记得重启 Celery
import ast
import logging
import time

from celery import shared_task

from .models import RetrieverTask, UploadFile, KeywordParagraph
from .tools.keyword_find import util_keyword_find
logger = logging.getLogger(__name__)

# Seconds a finished task and its uploaded files are kept before deletion.
_RESULT_RETENTION_SECONDS = 600


@shared_task
def cleanup_retriever_job(task_id):
    """Delete a retriever task together with its uploaded files.

    Scheduled by ``start_retriever_job``. Does nothing when the task no
    longer exists. Returns ``task_id``.
    """
    task = RetrieverTask.objects.filter(task_uuid=task_id).first()
    if task is None:
        return task_id
    for attachment in task.attachment.all():
        # save=False: the row is removed by the cascade delete just below.
        attachment.file.delete(save=False)
    task.delete()
    return task_id


@shared_task
def start_retriever_job(task_id):
    """Scan every unchecked file of a task and store the matching paragraphs.

    For each attachment with ``is_checked=False`` the file is searched with
    ``util_keyword_find``; one ``KeywordParagraph`` row is created per matching
    paragraph. If the search raises, a single error paragraph is stored and
    the file stays unchecked. The task is then flagged as finished and its
    deletion is scheduled ``_RESULT_RETENTION_SECONDS`` later as a separate
    Celery task, so no worker is blocked by sleeping for that time.

    :param task_id: UUID (object or string) of the ``RetrieverTask``.
    :return: ``task_id`` unchanged.
    """
    current_task = RetrieverTask.objects.get(task_uuid=task_id)
    # task_keywords holds the textual form of a Python list; parse it back.
    task_keywords = ast.literal_eval(current_task.task_keywords)
    for attachment in current_task.attachment.filter(is_checked=False):
        try:
            result_dict = util_keyword_find(attachment.file_path, task_keywords)
        except Exception:
            # Boundary of a background job: log and record a visible error entry.
            logger.exception("Keyword search failed for file %s", attachment.file_name)
            result_dict = {'file_name': attachment.file_name, 'find_list': ['该文件检查程序出错,请联系管理员'], 'paragraph_keyword': ['出错']}
        else:
            UploadFile.objects.filter(file_id=attachment.file_id).update(is_checked=True)
        # .get(): a finder result without 'paragraph_keyword' must not crash the job.
        matches = zip(result_dict.get('find_list', []), result_dict.get('paragraph_keyword', []))
        KeywordParagraph.objects.bulk_create(
            [KeywordParagraph(related_file=attachment, keyword=para_keyword, paragraph=paragraph)
             for paragraph, para_keyword in matches])
    RetrieverTask.objects.filter(task_uuid=task_id).update(task_status=True)
    # Delete the task and its files later, without holding this worker.
    cleanup_retriever_job.apply_async(args=[str(task_id)], countdown=_RESULT_RETENTION_SECONDS)
    return task_id

View File

@ -1,76 +0,0 @@
{% extends 'base.html' %}
{% load unicorn %}
{% block navbar_item %}
<a href="/" class="button is-primary">
返回首页
</a>
{% endblock %}
{% block content %}
<div class="container">
<div class="content">
{# <h5 class="title is-6">查找关键词: <span class="tag is-danger is-light is-medium">{{ current_task.task_keywords }}</span></h5>#}
<div>
{% if not current_task.task_status %}
<b>搜索状态: <span class="tag is-danger is-light is-medium">正在检索</span></b>
<progress class="progress is-small is-primary" max="100">15%</progress>
{% endif %}
{% if current_task.task_status %}
<b>搜索状态: <span class="tag is-success is-light is-medium">已完成</span></b>
{% endif %}
<br>
</div>
{% for task_file in current_task.attachment.all %}
<div class="block">
<span class="tag is-info is-medium">
{{ task_file.file_name }}
</span>
{% if task_file.file_keyword_str %}
<b>检索到的敏感词:{{ task_file.file_keyword_str }}</b>
<br>
{% for paragraph in task_file.keyword_paragraph.all %}
<p><b>{{ forloop.counter }}.</b>{{ paragraph.paragraph | safe }}</p>
{% endfor %}
{% endif %}
{% if not task_file.file_keyword_str %}
<b>未检索到敏感词</b>
{% endif %}
</div>
{% endfor %}
</div>
</div>
{% endblock %}
{#{% block content %}#}
{# <div>#}
{# {% unicorn 'paragraph_render' current_task.task_uuid %}#}
{# </div>#}
{##}
{#{% endblock %}#}
{% block script %}
/**
 * Reload the page after one second while the task status rendered into this
 * script by the template is not "True".
 */
function monitorAndRefresh() {
    // Status value substituted by the template engine when the page is rendered.
    const renderedStatus = "{{ current_task.task_status }}";
    if (renderedStatus === "True") {
        return;
    }
    // Not finished yet: refresh the page in 1000 ms.
    setTimeout(() => {
        location.reload();
    }, 1000);
}
// Run the check once when the script is evaluated.
monitorAndRefresh();
{% endblock %}

View File

@ -1,6 +0,0 @@
<div>
<!-- put component code here -->
<div unicorn:poll-1000="get_paragraphs">{{ current_task.task_status }}</div>
</div>

View File

@ -1,3 +0,0 @@
from django.test import TestCase
# Create your tests here.

View File

@ -1,222 +0,0 @@
import docx2txt
from pptx import Presentation
import re
import PyPDF2
import pandas as pd
def insert_html_tag(paragraph: str, keyword_list: list) -> str:
    """
    Highlight sensitive keywords in a paragraph with an HTML ``<span>``.

    All keywords are matched in a single pass over the original text, so a
    keyword can never match inside a tag inserted for another keyword (for
    example the keyword ``span``). Overlapping keywords prefer the longest
    match and empty keywords are ignored. Text outside the inserted tags is
    HTML-escaped (``&``, ``<``, ``>``) because the result is meant to be
    emitted as raw HTML.

    :param paragraph: paragraph to process
    :param keyword_list: sensitive keywords as a list of strings
    :return: the escaped paragraph with every keyword occurrence wrapped in
        ``<span style="background-color:yellow">...</span>``
    """
    import html  # local import keeps this function self-contained

    # Longest first so that e.g. "abc" wins over "ab" at the same position.
    keywords = sorted({kw for kw in keyword_list if kw}, key=lambda kw: (-len(kw), kw))
    if not keywords:
        return html.escape(paragraph, quote=False)
    pattern = re.compile('|'.join(re.escape(kw) for kw in keywords))
    pieces = []
    cursor = 0
    for match in pattern.finditer(paragraph):
        pieces.append(html.escape(paragraph[cursor:match.start()], quote=False))
        pieces.append('<span style="background-color:yellow">'
                      + html.escape(match.group(0), quote=False)
                      + '</span>')
        cursor = match.end()
    pieces.append(html.escape(paragraph[cursor:], quote=False))
    return ''.join(pieces)
def docx_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a docx file for sensitive keywords.

    The text returned by ``docx2txt.process`` is split on newlines; all
    whitespace is removed from each paragraph before the keywords are looked up.

    :param file_path: path of the file
    :param keyword_list: sensitive keywords as a list of strings
    :return: ``{'file_name': file_path, 'find_list': [paragraph, ...],
        'paragraph_keyword': [[keyword, ...], ...]}`` where the two lists are
        parallel and only contain paragraphs with at least one keyword
    """
    matched_paragraphs = []
    matched_keywords = []
    for raw_paragraph in docx2txt.process(file_path).split('\n'):
        compact = re.sub(r'\s+', '', raw_paragraph)  # drop every whitespace character
        hits = [keyword for keyword in keyword_list if keyword in compact]
        if not hits:
            continue
        matched_keywords.append(hits)
        matched_paragraphs.append(compact)
    return {
        'file_name': file_path,
        'find_list': matched_paragraphs,
        'paragraph_keyword': matched_keywords,
    }
def pptx_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a pptx file for sensitive keywords.

    Scope: every shape with a text frame on every slide. The runs of one
    paragraph are concatenated before searching, so a keyword that is split
    across runs (for example by a formatting change in the middle of a word)
    is still found.

    :param file_path: path of the file
    :param keyword_list: sensitive keywords as a list of strings
    :return: ``{'file_name': file_path, 'find_list': [paragraph, ...],
        'paragraph_keyword': [[keyword, ...], ...]}`` with parallel lists
    """
    prs = Presentation(file_path)  # open the presentation
    paragraph_texts = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            for paragraph in shape.text_frame.paragraphs:
                paragraph_texts.append(''.join(run.text for run in paragraph.runs))

    find_list = []
    paragraph_keyword = []
    # Splitting again keeps newlines embedded in run text as paragraph breaks.
    for para in '\n'.join(paragraph_texts).split('\n'):
        para = re.sub(r'\s+', '', para)
        this_para_keyword = [keyword for keyword in keyword_list if keyword in para]
        if this_para_keyword:
            paragraph_keyword.append(this_para_keyword)
            find_list.append(para)
    return {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
def pdf_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a pdf file for sensitive keywords.

    Each page's extracted text has its newlines and spaces removed and is then
    searched as one paragraph, so a keyword split across a page break is not
    found.

    :param file_path: path of the file
    :param keyword_list: sensitive keywords as a list of strings
    :return: ``{'file_name': file_path, 'find_list': [paragraph, ...],
        'paragraph_keyword': [[keyword, ...], ...]}`` with parallel lists
    """
    with open(file_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # One newline-terminated entry per page, in page order.
        text_content = "".join(
            page.extract_text().replace("\n", "").replace(" ", "") + "\n"
            for page in reader.pages
        )

    matched_paragraphs = []
    matched_keywords = []
    for chunk in text_content.split('\n'):
        compact = re.sub(r'\s+', '', chunk)
        hits = [keyword for keyword in keyword_list if keyword in compact]
        if hits:
            matched_keywords.append(hits)
            matched_paragraphs.append(compact)
    return {
        'file_name': file_path,
        'find_list': matched_paragraphs,
        'paragraph_keyword': matched_keywords,
    }
def excel_find(file_path: str, keyword_list: list) -> dict:
    """
    Search an Excel workbook for sensitive keywords.

    Scope: every string cell of every worksheet (sheets are read without a
    header row). The workbook is opened once and closed when the search ends.

    :param file_path: path of the file
    :param keyword_list: sensitive keywords as a list of strings
    :return: ``{'file_name': file_path, 'find_list': [...],
        'paragraph_keyword': [[keyword, ...], ...]}``; each ``find_list`` entry
        is ``"<sheet name>, <1-based row>, <column label>, <cell text>"``
    """
    find_list = []  # location and text of every cell containing a keyword
    paragraph_keyword = []  # keywords found in the corresponding cell
    # Context manager closes the file handle; parse() reuses the open workbook
    # instead of re-opening the file for every sheet.
    with pd.ExcelFile(file_path) as workbook:
        for sheet_name in workbook.sheet_names:
            sheet = workbook.parse(sheet_name, header=None)
            for index, row in sheet.iterrows():
                for col_name, cell_value in row.items():
                    if not isinstance(cell_value, str):
                        continue
                    this_para_keyword = [keyword for keyword in keyword_list if keyword in cell_value]
                    if this_para_keyword:
                        paragraph_keyword.append(this_para_keyword)
                        find_list.append(f"{sheet_name}, {index + 1}, {col_name}, {cell_value}")
    return {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
def txt_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a UTF-8 text file for sensitive keywords.

    Every line is one paragraph; all whitespace is removed from it before the
    keywords are looked up. The file is decoded as UTF-8 regardless of the
    platform default, and a leading byte-order mark is discarded.

    :param file_path: path of the file
    :param keyword_list: sensitive keywords as a list of strings
    :return: ``{'file_name': file_path, 'find_list': [paragraph, ...],
        'paragraph_keyword': [[keyword, ...], ...]}`` with parallel lists
    :raises UnicodeDecodeError: if the file is not valid UTF-8
    """
    find_list = []
    paragraph_keyword = []
    # 'utf-8-sig' reads plain UTF-8 and also strips a BOM written by some editors.
    with open(file_path, 'r', encoding='utf-8-sig') as txt_file:
        for line in txt_file:
            para = re.sub(r'\s+', '', line)
            this_para_keyword = [keyword for keyword in keyword_list if keyword in para]
            if this_para_keyword:
                paragraph_keyword.append(this_para_keyword)
                find_list.append(para)
    return {'file_name': file_path, 'find_list': find_list, 'paragraph_keyword': paragraph_keyword}
def util_keyword_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a single file for sensitive keywords and highlight the matches.

    The finder is chosen from the file extension (case-insensitively). Every
    paragraph returned by the finder is passed through ``insert_html_tag``.

    :param file_path: path of the file
    :param keyword_list: sensitive keywords as a list of strings
    :return: ``{'file_name': ..., 'find_list': [marked paragraph, ...],
        'paragraph_keyword': [[keyword, ...], ...]}``; for an unsupported
        extension both lists are empty (the ``paragraph_keyword`` key is
        always present)
    """
    # Pick the finder by extension; names are resolved only on the taken branch.
    lowered = file_path.lower()
    if lowered.endswith(('.docx', '.doc', '.dot')):
        finder = docx_find
    elif lowered.endswith(('.pptx', '.ppt', '.pot')):
        finder = pptx_find
    elif lowered.endswith(('.xlsx', '.xls', '.xlsm', '.xlsb', '.xltm', '.xltx')):
        finder = excel_find
    elif lowered.endswith('.pdf'):
        finder = pdf_find
    elif lowered.endswith('.txt'):
        finder = txt_find
    else:
        return {'file_name': file_path, 'find_list': [], 'paragraph_keyword': []}

    find_dict = finder(file_path, keyword_list)
    # Mark by position instead of list.index(): linear and safe with duplicates.
    find_dict['find_list'] = [insert_html_tag(para, keyword_list) for para in find_dict['find_list']]
    return find_dict
# Manual check when the module is run as a script: search a local sample
# workbook and print the resulting dictionary.
if __name__ == '__main__':
# print(util_keyword_find('浅水海底电缆打捞大作业.docx', ['打捞', '浅水']))
print(util_keyword_find('浅水.xlsx', ['打捞', '声呐', '浅水']))

View File

@ -1,7 +0,0 @@
from django.urls import path, include
from . import views
# '' -> KeywordRetrieverView (named 'index');
# '<uuid:task_uuid>' -> TaskViewerView (named 'task_viewer').
urlpatterns = [
path('', views.KeywordRetrieverView.as_view(), name='index'),
path('<uuid:task_uuid>', views.TaskViewerView.as_view(), name='task_viewer')
]

View File

@ -1,47 +0,0 @@
from django.shortcuts import render
from django.urls import reverse_lazy
from django.views import View
from django.views.generic import FormView
from retriever.forms import UploadForm
from retriever.models import RetrieverTask, UploadFile, KeywordParagraph, Keywords
from .tasks import start_retriever_job
# Create your views here.
class KeywordRetrieverView(FormView):
    """Upload page: creates a retriever task from the submitted files."""

    form_class = UploadForm
    template_name = "index.html"  # Replace with your template.

    def form_valid(self, form):
        """Create the task and its ``UploadFile`` rows, then redirect.

        The task is created with the list of currently active keywords; each
        uploaded attachment is stored as an ``UploadFile`` linked to it.
        """
        active_keywords = Keywords.objects.filter(is_active=True).values_list('keyword', flat=True)
        r_task = RetrieverTask.objects.create(task_keywords=list(active_keywords))
        # Remembered for get_success_url().
        self.task_uuid = r_task.task_uuid
        for uploaded in form.cleaned_data['attachments']:
            UploadFile.objects.create(related_task=r_task, file_name=uploaded.name, file=uploaded)
        return super().form_valid(form)

    def get_success_url(self):
        """Redirect target: the task viewer page of the task just created."""
        url_kwargs = {'task_uuid': self.task_uuid}
        return reverse_lazy('task_viewer', kwargs=url_kwargs)
class TaskViewerView(View):
    """Result page of a retriever task; starts the background job on first view."""

    def get(self, requests, task_uuid):
        """Render ``task_viewer.html`` for the task with the given UUID.

        Responds with 404 when no such task exists (for example after it has
        been deleted). The background job is dispatched only by the request
        that atomically switches ``task_started`` from False to True, so
        overlapping page reloads cannot start it twice.
        """
        from django.shortcuts import get_object_or_404  # local: only needed here

        current_task = get_object_or_404(RetrieverTask, task_uuid=task_uuid)
        # Conditional UPDATE: returns 1 only for the request that claims the task.
        claimed = RetrieverTask.objects.filter(
            pk=current_task.pk, task_started=False
        ).update(task_started=True)
        if claimed:
            current_task.task_started = True  # keep the in-memory object in sync
            start_retriever_job.delay(current_task.task_uuid)  # run asynchronously
        return render(requests, 'task_viewer.html', {'current_task': current_task})

11851
static/bulma.css vendored

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,104 +0,0 @@
{% load static %}
{% load unicorn %}
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1.0, shrink-to-fit=no">
<meta name="author" content="Aldi Duzha">
<meta name="description" content="Search houses and apartments for rent anywhere within the US. View floorplans, pricing, images and more. Find your perfect rental.">
<meta name="keywords" content="bulma, rent, template, apartments, page, website, free, awesome">
<title>寻章智搜</title>
<link rel="stylesheet" href="{% static 'fontawesomecss/css/all.min.css' %}">
<link rel="stylesheet" type="text/css" href="{% static 'bulma.min.css' %}">
{% unicorn_scripts %}
</head>
<body>
{% csrf_token %}
<div id="app">
<section class="hero is-fullheight is-light" >
<div class="hero-head">
<nav class="navbar is-transparent is-spaced" role="navigation" aria-label="main navigation">
<div class="container">
<div class="navbar-brand">
<a class="navbar-item" href="/">
{# <img src="https://bulma.io/images/bulma-logo.png" alt="Bulma Rent" width="80" height="20">#}
<span class="is-size-3 has-text-weight-semibold">寻章智搜 &bullet; </span>
<span class="is-size-3 has-text-weight-light" style="color: gray">直升机所保密办</span>
</a>
<a role="button" class="navbar-burger burger" aria-label="menu" aria-expanded="false" data-target="navbarTopMain">
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
<span aria-hidden="true"></span>
</a>
</div>
<div class="navbar-menu" id="navbarTopMain">
<div class="navbar-end">
<div class="navbar-item">
{% block navbar_item %}
<a href="/admin" class="button is-primary">
敏感词库维护
</a>
{% endblock %}
</div>
</div>
</div>
</div>
</nav>
</div>
{% block content %}
{% endblock %}
</section>
<footer class="hero is-small is-light">
<div class="hero-body">
<div class="container has-text-centered">
<a href="https://raiot.me">
Made with <span class="has-text-danger"></span> by Raiot
</a>
<div class="columns m-t-10">
<div class="column">
<nav class="has-text-grey-light">
<a href="#" class="has-text-primary">About</a>
{# &bullet;#}
</nav>
</div>
</div>
<div class="b-t m-t-30 p-t-30 has-text-grey-light is-size-7">
Rent Template 2020 &copy; Aldi Duzha <br>
</div>
</div>
</div>
</footer>
</div>
<script>
// Once the DOM is ready, make every ".navbar-burger" toggle the "is-active"
// class on itself and on the element named by its "data-target" attribute.
document.addEventListener('DOMContentLoaded', () => {
    const burgers = Array.from(document.querySelectorAll('.navbar-burger'));
    burgers.forEach((burger) => {
        burger.addEventListener('click', () => {
            // The target element id comes from the burger's data-target attribute.
            const menu = document.getElementById(burger.dataset.target);
            burger.classList.toggle('is-active');
            menu.classList.toggle('is-active');
        });
    });
});
{% block script %}
{% endblock %}
</script>
</body>
</html>

View File

@ -1,97 +0,0 @@
{% extends 'base.html' %}
{% load static %}
{% load unicorn %}
{% block navbar_item %}
<a href="/admin" class="button is-primary">
敏感词库维护
</a>
{% endblock %}
{% block content %}
<div class="hero-body p-b-30 ">
<div class="container">
<h2 class="subtitle">
<span class="has-text-centered is-block">
Sensitive Word Detection Tool for Documents
</span>
</h2>
<h1 class="title">
<span class="is-size-2 has-text-centered is-block">敏感词检测工具</span>
</h1>
<div class="container">
<div class="notification">
<span class="has-text-centered is-block">可批量上传,支持 .docx, .doc, .dot, .pptx, .ppt, .pdf, .xls, .xlsx, .txt(UTF-8) 格式文件</span>
<br>
<form method="POST" enctype="multipart/form-data">
{% csrf_token %}
<div class="columns">
<div class="column has-text-centered">
<div class="file is-boxed is-centered">
<label class="file-label">
{{ form.attachments }}
<span class="file-cta">
<span class="file-icon">
<i class="fas fa-upload"></i>
</span>
<span class="file-label">
选择文件
</span>
</span>
</label>
</div>
</div>
</div>
<div class="columns">
<div class="column"></div>
<div class="is-centered column" id="file-list">
</div>
<div class="column"></div>
</div>
<br>
<div class="is-centered" style="text-align:center">
<input type="submit" value="提交" class="button is-link is-medium is-center">
</div>
</form>
</div>
</div>
</div>
</div>
{% endblock %}
{% block script %}
{# show file name after file selected#}
// File input and the container that lists the selected file names.
const fileInput = document.getElementById('file-input');
const fileListDiv = document.getElementById('file-list');
// Rebuild the list whenever the selection changes.
fileInput.addEventListener('change', function() {
    // Heading first; assigning textContent also clears the previous list.
    fileListDiv.textContent = '所选文件:';
    fileListDiv.appendChild(document.createElement('br'));
    // File names are added as text nodes, never parsed as HTML, so a name
    // containing markup is displayed literally.
    for (const file of fileInput.files) {
        fileListDiv.appendChild(document.createTextNode(file.name));
        fileListDiv.appendChild(document.createElement('br'));
    }
});
{% endblock %}