From ef9c416edaf7dec51f00a7f984c8f20cf646b9eb Mon Sep 17 00:00:00 2001 From: raiot Date: Fri, 16 Feb 2024 16:47:01 +0800 Subject: [PATCH] prepare for offline env --- KeywordRetriever/settings.py | 2 + KeywordRetriever/urls.py | 2 + retriever/admin.py | 8 ++++ retriever/forms.py | 2 +- retriever/migrations/0001_initial.py | 61 ++++++++++++++++++++++++++++ retriever/models.py | 10 +++-- retriever/tasks.py | 16 +++++++- retriever/templates/task_viewer.html | 21 +++++++--- templates/base.html | 22 ++++++---- templates/index.html | 59 +++++++++++++++++++++++---- 10 files changed, 175 insertions(+), 28 deletions(-) create mode 100644 retriever/migrations/0001_initial.py diff --git a/KeywordRetriever/settings.py b/KeywordRetriever/settings.py index b603aa3..82945ad 100644 --- a/KeywordRetriever/settings.py +++ b/KeywordRetriever/settings.py @@ -11,6 +11,7 @@ https://docs.djangoproject.com/en/4.2/ref/settings/ """ from pathlib import Path +import os # Build paths inside the project like this: BASE_DIR / 'subdir'. BASE_DIR = Path(__file__).resolve().parent.parent @@ -123,6 +124,7 @@ STATIC_URL = 'static/' STATICFILES_DIRS = [ BASE_DIR / 'static', ] +STATIC_ROOT = os.path.join(BASE_DIR, 'static', 'static_root') # Default primary key field type # https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field diff --git a/KeywordRetriever/urls.py b/KeywordRetriever/urls.py index b004c58..8ff551e 100644 --- a/KeywordRetriever/urls.py +++ b/KeywordRetriever/urls.py @@ -24,3 +24,5 @@ urlpatterns = [ path("unicorn/", include("django_unicorn.urls")), path('', include('retriever.urls')), ] + +admin.site.site_header = '敏感词维护' \ No newline at end of file diff --git a/retriever/admin.py b/retriever/admin.py index fe8e86f..d11f365 100644 --- a/retriever/admin.py +++ b/retriever/admin.py @@ -1,7 +1,15 @@ from django.contrib import admin +from django.contrib.auth.models import Group, User +from django_celery_results.models import TaskResult, GroupResult # Register your models here. from retriever.models import RetrieverTask, UploadFile, KeywordParagraph, Keywords admin.site.register(Keywords) + +# hide the user and group models +admin.site.unregister(Group) +admin.site.unregister(User) +admin.site.unregister(TaskResult) +admin.site.unregister(GroupResult) diff --git a/retriever/forms.py b/retriever/forms.py index 44e4a45..384d801 100644 --- a/retriever/forms.py +++ b/retriever/forms.py @@ -12,4 +12,4 @@ class SpaceSeparatedField(forms.CharField): class UploadForm(forms.Form): attachments = MultiFileField(min_num=1, max_num=10, max_file_size=1024 * 1024 * 64, - attrs={'class': 'file-input is-primary', 'accept': '.docx, .doc, .dot, .pptx, .ppt, .pdf, .xls'}) + attrs={'class': 'file-input is-primary', 'accept': '.docx, .doc, .dot, .pptx, .ppt, .pdf, .xls, .xlsx, .txt', 'id': 'file-input'}) diff --git a/retriever/migrations/0001_initial.py b/retriever/migrations/0001_initial.py new file mode 100644 index 0000000..b42ccd2 --- /dev/null +++ b/retriever/migrations/0001_initial.py @@ -0,0 +1,61 @@ +# Generated by Django 4.2.4 on 2023-08-25 14:14 + +import django.core.validators +from django.db import migrations, models +import django.db.models.deletion +import uuid + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ] + + operations = [ + migrations.CreateModel( + name='Keywords', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('keyword', models.CharField(max_length=64, unique=True, verbose_name='敏感词')), + ('is_active', models.BooleanField(default=True, verbose_name='是否启用')), + ('keyword_created', models.DateTimeField(auto_now_add=True)), + ], + options={ + 'verbose_name': '敏感词', + 'verbose_name_plural': '敏感词', + }, + ), + migrations.CreateModel( + name='RetrieverTask', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('task_uuid', models.UUIDField(default=uuid.uuid4, editable=False, unique=True)), + ('task_keywords', models.CharField(max_length=1024)), + ('task_status', models.BooleanField(default=False)), + ('task_started', models.BooleanField(default=False)), + ('task_created', models.DateTimeField(auto_now_add=True)), + ], + ), + migrations.CreateModel( + name='UploadFile', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('file_id', models.UUIDField(default=uuid.uuid4, editable=False, unique=True)), + ('file_name', models.CharField(max_length=100)), + ('is_checked', models.BooleanField(default=False)), + ('file', models.FileField(upload_to='uploads/', validators=[django.core.validators.FileExtensionValidator(allowed_extensions=['docx'])])), + ('related_task', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='attachment', to='retriever.retrievertask')), + ], + ), + migrations.CreateModel( + name='KeywordParagraph', + fields=[ + ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('keyword', models.CharField(max_length=1024)), + ('paragraph', models.TextField()), + ('related_file', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='keyword_paragraph', to='retriever.uploadfile')), + ], + ), + ] diff --git a/retriever/models.py b/retriever/models.py index bb02bc7..9370a04 100644 --- a/retriever/models.py +++ b/retriever/models.py @@ -37,7 +37,7 @@ class UploadFile(models.Model): # 以列表形式返回文件所属的所有段落的keyword keyword_in_paragraph = self.keyword_paragraph.all().values_list('keyword', flat=True) flat_keyword = [item.strip("[]' ") for sublist in keyword_in_paragraph for item in sublist.split(',')] - print(flat_keyword) + # print(flat_keyword) # 排除重复的keyword 并返回以逗号分隔的字符串 return ','.join(set(flat_keyword)) # return self.keyword_paragraph.all().values_list('keyword', flat=True) @@ -53,8 +53,8 @@ class KeywordParagraph(models.Model): class Keywords(models.Model): - keyword = models.CharField(max_length=64) - is_active = models.BooleanField(default=True) + keyword = models.CharField(max_length=64, unique=True, blank=False, null=False, verbose_name='敏感词') + is_active = models.BooleanField(default=True, verbose_name='是否启用') keyword_created = models.DateTimeField(auto_now_add=True) def __str__(self): @@ -64,3 +64,7 @@ class Keywords(models.Model): def active_keyword_list(self): # 以列表形式返回所有is_active=True的keyword return Keywords.objects.filter(is_active=True).values_list('keyword', flat=True) + + class Meta: + verbose_name = '敏感词' + verbose_name_plural = '敏感词' diff --git a/retriever/tasks.py b/retriever/tasks.py index 1722109..44fb14f 100644 --- a/retriever/tasks.py +++ b/retriever/tasks.py @@ -1,5 +1,6 @@ # app/tasks.py, 可以复用的task 改这个文件记得重启 Celery!!! import ast +import time from celery import shared_task from .models import RetrieverTask, UploadFile, KeywordParagraph @@ -15,8 +16,13 @@ def start_retriever_job(task_id): for each in current_task.attachment.all(): if not each.is_checked: - result_dict = util_keyword_find(each.file_path, task_keywords) - UploadFile.objects.filter(file_id=each.file_id).update(is_checked=True) # 更新is_checked字段 + try: + result_dict = util_keyword_find(each.file_path, task_keywords) + UploadFile.objects.filter(file_id=each.file_id).update(is_checked=True) # 更新is_checked字段 + + except Exception as e: + print(e) + result_dict = {'file_name': each.file_name, 'find_list': ['该文件检查程序出错,请联系管理员'], 'paragraph_keyword': ['出错']} KeywordParagraph.objects.bulk_create( [KeywordParagraph(related_file=each, keyword=para_keyword, paragraph=paragraph) for paragraph, para_keyword in @@ -24,4 +30,10 @@ def start_retriever_job(task_id): elif each.is_checked: continue RetrieverTask.objects.filter(task_uuid=task_id).update(task_status=True) + + time.sleep(600) + # delete task and related files + for each in current_task.attachment.all(): + each.file.delete(save=True) + RetrieverTask.objects.filter(task_uuid=task_id).delete() return task_id diff --git a/retriever/templates/task_viewer.html b/retriever/templates/task_viewer.html index 315ec47..de4bc54 100644 --- a/retriever/templates/task_viewer.html +++ b/retriever/templates/task_viewer.html @@ -1,6 +1,10 @@ {% extends 'base.html' %} {% load unicorn %} - +{% block navbar_item %} + + 返回首页 + +{% endblock %} {% block content %}
@@ -22,11 +26,16 @@ {{ task_file.file_name }} - 检索到的敏感词:{{ task_file.file_keyword_str }} -
- {% for paragraph in task_file.keyword_paragraph.all %} -

{{ forloop.counter }}.{{ paragraph.paragraph | safe }}

- {% endfor %} + {% if task_file.file_keyword_str %} + 检索到的敏感词:{{ task_file.file_keyword_str }} +
+ {% for paragraph in task_file.keyword_paragraph.all %} +

{{ forloop.counter }}.{{ paragraph.paragraph | safe }}

+ {% endfor %} + {% endif %} + {% if not task_file.file_keyword_str %} + 未检索到敏感词 + {% endif %}
diff --git a/templates/base.html b/templates/base.html index 03d16b3..fa31961 100644 --- a/templates/base.html +++ b/templates/base.html @@ -11,7 +11,7 @@ 寻章智搜 - + {% unicorn_scripts %} @@ -25,8 +25,12 @@ @@ -62,13 +66,13 @@
- Rent Template 2020 © Aldi Duzha
Terms of use and Privacy policy. - Fair Housing + Rent Template 2020 © Aldi Duzha
diff --git a/templates/index.html b/templates/index.html index f79bdbd..030181c 100644 --- a/templates/index.html +++ b/templates/index.html @@ -2,27 +2,32 @@ {% load static %} {% load unicorn %} - +{% block navbar_item %} + + 敏感词库维护 + +{% endblock %} {% block content %}

- Search hundreds of thousands of apartments, condos and houses for rent. + Sensitive Word Detection Tool for Documents

- 文件敏感词检测工具 + 敏感词检测工具

- 批量上传 + 可批量上传,支持 .docx, .doc, .dot, .pptx, .ppt, .pdf, .xls, .xlsx, .txt(UTF-8) 格式文件 +
{% csrf_token %}
-
+
+
+
+
+ + +
+
+
+
- +
+ +
+
@@ -49,4 +66,32 @@
+{% endblock %} + +{% block script %} + +{# show file name after file selected#} + + // 获取文件选择输入框和文件名显示容器 + const fileInput = document.getElementById('file-input'); + const fileListDiv = document.getElementById('file-list'); + + // 添加事件监听器来处理文件选择 + fileInput.addEventListener('change', function() { + // 获取所选文件列表 + const selectedFiles = fileInput.files; + + // 创建一个用于显示文件名的字符串 + let fileListText = '所选文件:
'; + + // 遍历文件列表并添加文件名到字符串 + for (let i = 0; i < selectedFiles.length; i++) { + const fileName = selectedFiles[i].name; + fileListText += `${fileName}
`; + } + + // 将文件名字符串插入到页面上的容器中 + fileListDiv.innerHTML = fileListText; + }); + {% endblock %} \ No newline at end of file