Compare commits
No commits in common. "main" and "e6baff66813e11b2f0533df9329d70dafa0deb1b" have entirely different histories.
main
...
e6baff6681
|
@ -0,0 +1,3 @@
|
|||
# Import the Celery application from the sibling ``celery`` module and expose
# it as ``celery_app``, the only public name of this package.
from .celery import app as celery_app

__all__ = ('celery_app',)
|
|
@ -0,0 +1,16 @@
|
|||
"""
|
||||
ASGI config for KeywordRetriever project.
|
||||
|
||||
It exposes the ASGI callable as a module-level variable named ``application``.
|
||||
|
||||
For more information on this file, see
|
||||
https://docs.djangoproject.com/en/4.2/howto/deployment/asgi/
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from django.core.asgi import get_asgi_application
|
||||
|
||||
# Use this project's settings module unless DJANGO_SETTINGS_MODULE is already
# set in the environment, then build the module-level ASGI callable.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'KeywordRetriever.settings')

application = get_asgi_application()
|
|
@ -0,0 +1,20 @@
|
|||
import os
|
||||
from celery import Celery
|
||||
|
||||
# Default the settings module so the Django settings can be loaded when
# DJANGO_SETTINGS_MODULE is not already set in the environment.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'KeywordRetriever.settings')

# Create the Celery application instance for this project.
app = Celery('KeywordRetriever')

# Read the configuration from the Django settings. With namespace='CELERY'
# every Celery option must be written with a ``CELERY`` prefix there, which
# keeps Celery options from clashing with other settings.
app.config_from_object('django.conf:settings', namespace='CELERY')

# Discover tasks automatically from the registered Django apps.
app.autodiscover_tasks()
|
||||
|
||||
|
||||
# # 一个测试任务
|
||||
# @app.task(bind=True)
|
||||
# def debug_task(self):
|
||||
# print(f'Request: {self.request!r}')
|
|
@ -0,0 +1,161 @@
|
|||
"""
|
||||
Django settings for KeywordRetriever project.
|
||||
|
||||
Generated by 'django-admin startproject' using Django 4.2.4.
|
||||
|
||||
For more information on this file, see
|
||||
https://docs.djangoproject.com/en/4.2/topics/settings/
|
||||
|
||||
For the full list of settings and their values, see
|
||||
https://docs.djangoproject.com/en/4.2/ref/settings/
|
||||
"""
|
||||
|
||||
import os
from pathlib import Path
|
||||
|
||||
# Build paths inside the project like this: BASE_DIR / 'subdir'.
|
||||
BASE_DIR = Path(__file__).resolve().parent.parent
|
||||
|
||||
|
||||
# Quick-start development settings - unsuitable for production
|
||||
# See https://docs.djangoproject.com/en/4.2/howto/deployment/checklist/
|
||||
|
||||
# SECURITY WARNING: keep the secret key used in production secret!
# The key is taken from the DJANGO_SECRET_KEY environment variable; the
# development key below is only the fallback when that variable is unset.
SECRET_KEY = os.environ.get(
    'DJANGO_SECRET_KEY',
    'django-insecure-&oj4h^8820e14zb_#p(-k8_pcd96y!hng&mre)*1i)owkk1+&d',
)

# SECURITY WARNING: don't run with debug turned on in production!
# DJANGO_DEBUG accepts 1/true/yes/on (case-insensitive) to enable debug mode;
# any other value disables it. Leaving it unset keeps the previous value True.
DEBUG = os.environ.get('DJANGO_DEBUG', 'true').strip().lower() in {'1', 'true', 'yes', 'on'}

# Comma-separated host names from DJANGO_ALLOWED_HOSTS; blank entries are
# ignored. Leaving it unset keeps the previous value [].
ALLOWED_HOSTS = [
    host.strip()
    for host in os.environ.get('DJANGO_ALLOWED_HOSTS', '').split(',')
    if host.strip()
]
|
||||
|
||||
|
||||
# Application definition
|
||||
|
||||
# Django contrib apps first, then third-party apps, then the project app.
INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'django_unicorn',
    'django_celery_results',
    'retriever',
]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
    'django.middleware.clickjacking.XFrameOptionsMiddleware',
]

ROOT_URLCONF = 'KeywordRetriever.urls'

# Templates are looked up in the project-level ``templates`` directory and in
# each installed app (APP_DIRS).
TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [BASE_DIR / 'templates'],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'KeywordRetriever.wsgi.application'


# Database
# https://docs.djangoproject.com/en/4.2/ref/settings/#databases

DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': BASE_DIR / 'db.sqlite3',
    },
}


# Password validation
# https://docs.djangoproject.com/en/4.2/ref/settings/#auth-password-validators

AUTH_PASSWORD_VALIDATORS = [
    {'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator'},
    {'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator'},
    {'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator'},
    {'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
]


# Internationalization
# https://docs.djangoproject.com/en/4.2/topics/i18n/

LANGUAGE_CODE = 'zh-Hans'
TIME_ZONE = 'Asia/Shanghai'
USE_I18N = True
USE_TZ = True


# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/4.2/howto/static-files/

STATIC_URL = 'static/'
STATICFILES_DIRS = [BASE_DIR / 'static']


# Default primary key field type
# https://docs.djangoproject.com/en/4.2/ref/settings/#default-auto-field

DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
|
||||
|
||||
# Broker URL used by Celery.
CELERY_BROKER_URL = "redis://127.0.0.1:6379/0"

# Celery time zone. Kept identical to Django's TIME_ZONE to avoid time
# offsets (Django itself needs both USE_TZ = True and TIME_ZONE set).
CELERY_TIMEZONE = TIME_ZONE

# Backend in which django_celery_results stores task results.
# A database URL backend has the form db+scheme://user:password@host:port/dbname;
# the Django-based backends are "django-db" (database) and "django-cache"
# (cache) for task state and results.
CELERY_RESULT_BACKEND = "django-db"

# Message content / serialisation formats (JSON).
CELERY_ACCEPT_CONTENT = ['application/json']
CELERY_TASK_SERIALIZER = 'json'
CELERY_RESULT_SERIALIZER = 'json'

# Per-task time limit in seconds. A task that exceeds it is aborted and the
# next task is executed.
CELERY_TASK_TIME_LIMIT = 5

# Expiry of stored results. The default is one day; when beat is running,
# Celery cleans expired results up daily. 0 means results never expire.
# NOTE(review): Celery reads a plain number here as seconds, so 1 is one
# second rather than one day - confirm this is intended.
CELERY_RESULT_EXPIRES = 1

# Task rate limiting.
# NOTE(review): no task named 'tasks.add' is visible in this project - confirm
# this annotation is still needed.
CELERY_TASK_ANNOTATIONS = {'tasks.add': {'rate_limit': '10/s'}}

# Worker concurrency. Usually defaults to the number of CPU cores, so this
# setting is optional.
CELERY_WORKER_CONCURRENCY = 2

# Number of tasks a worker executes before it is replaced (unlimited by
# default).
CELERY_WORKER_MAX_TASKS_PER_CHILD = 200
|
|
@ -0,0 +1,26 @@
|
|||
"""
|
||||
URL configuration for KeywordRetriever project.
|
||||
|
||||
The `urlpatterns` list routes URLs to views. For more information please see:
|
||||
https://docs.djangoproject.com/en/4.2/topics/http/urls/
|
||||
Examples:
|
||||
Function views
|
||||
1. Add an import: from my_app import views
|
||||
2. Add a URL to urlpatterns: path('', views.home, name='home')
|
||||
Class-based views
|
||||
1. Add an import: from other_app.views import Home
|
||||
2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
|
||||
Including another URLconf
|
||||
1. Import the include() function: from django.urls import include, path
|
||||
2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
|
||||
"""
|
||||
from django.contrib import admin
|
||||
from django.urls import path, include
|
||||
import django_unicorn
|
||||
|
||||
|
||||
# Project-level routes: the Django admin, the django-unicorn endpoints and,
# at the site root, the URLs of the ``retriever`` app.
urlpatterns = [
    path('admin/', admin.site.urls),
    path("unicorn/", include("django_unicorn.urls")),
    path('', include('retriever.urls')),
]
|
|
@ -0,0 +1,16 @@
|
|||
"""
|
||||
WSGI config for KeywordRetriever project.
|
||||
|
||||
It exposes the WSGI callable as a module-level variable named ``application``.
|
||||
|
||||
For more information on this file, see
|
||||
https://docs.djangoproject.com/en/4.2/howto/deployment/wsgi/
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from django.core.wsgi import get_wsgi_application
|
||||
|
||||
# Use this project's settings module unless DJANGO_SETTINGS_MODULE is already
# set in the environment, then build the module-level WSGI callable.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'KeywordRetriever.settings')

application = get_wsgi_application()
|
|
@ -0,0 +1,22 @@
|
|||
#!/usr/bin/env python
|
||||
"""Django's command-line utility for administrative tasks."""
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def main():
    """Run administrative tasks.

    Defaults DJANGO_SETTINGS_MODULE to this project's settings and hands
    ``sys.argv`` to Django's command-line runner.

    :raises ImportError: if Django cannot be imported; the original import
        error is chained as the cause.
    """
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'KeywordRetriever.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        hint = (
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        )
        raise ImportError(hint) from exc
    else:
        execute_from_command_line(sys.argv)


if __name__ == '__main__':
    main()
|
|
@ -0,0 +1,7 @@
|
|||
from django.contrib import admin
|
||||
|
||||
# Register your models here.
|
||||
|
||||
from retriever.models import RetrieverTask, UploadFile, KeywordParagraph, Keywords
|
||||
|
||||
# Make the Keywords model manageable through the Django admin site.
admin.site.register(Keywords)
|
|
@ -0,0 +1,6 @@
|
|||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class RetrieverConfig(AppConfig):
    """Application configuration of the ``retriever`` app."""

    # Name of the application this configuration belongs to.
    name = 'retriever'
    # Field type used for automatically created primary keys.
    default_auto_field = 'django.db.models.BigAutoField'
|
|
@ -0,0 +1,20 @@
|
|||
from django.utils.timezone import now
|
||||
from django_unicorn.components import UnicornView
|
||||
|
||||
from retriever.models import RetrieverTask
|
||||
|
||||
|
||||
class ParagraphRenderView(UnicornView):
    """Unicorn component that loads a ``RetrieverTask`` by its UUID.

    The UUID is the first positional component argument and is remembered in
    ``mount``. ``get_paragraphs`` reloads the task so that ``current_task``
    holds its latest stored state.
    """

    # Most recently loaded task; stays ``None`` until ``get_paragraphs`` runs.
    current_task = None

    def mount(self):
        """Store the first component argument (the task UUID) on ``self.arg``."""
        self.arg = self.component_args[0]

    def get_paragraphs(self):
        """Reload the task identified by ``self.arg`` and return it.

        The task is fetched with a single query; the same instance is stored
        on ``current_task`` and returned. (Previously the identical query was
        executed twice and the result printed to stdout.)

        :raises RetrieverTask.DoesNotExist: if no task has that UUID.
        """
        self.current_task = RetrieverTask.objects.get(task_uuid=self.arg)
        return self.current_task
|
||||
|
||||
|
|
@ -0,0 +1,15 @@
|
|||
from django import forms
|
||||
from multiupload.fields import MultiFileField
|
||||
from retriever.models import Keywords
|
||||
|
||||
|
||||
class SpaceSeparatedField(forms.CharField):
    """Char field whose Python value is a list of whitespace-separated tokens."""

    def to_python(self, value):
        """Split *value* into a list of tokens.

        :param value: raw submitted value; empty values give an empty list.
        :return: list of non-empty tokens.

        ``str.split()`` without a separator is used so that repeated, leading
        or trailing whitespace never produces empty-string tokens, which
        ``split(' ')`` did (``'a  b'`` became ``['a', '', 'b']``).
        """
        if not value:
            return []
        return value.split()
|
||||
|
||||
|
||||
class UploadForm(forms.Form):
    """Upload form with a single multi-file field.

    The field accepts between 1 and 10 files, with ``max_file_size`` set to
    64 MiB, and renders with the CSS classes ``file-input is-primary``.
    """

    attachments = MultiFileField(
        min_num=1,
        max_num=10,
        max_file_size=1024 * 1024 * 64,  # 64 MiB, in bytes
        attrs={'class': 'file-input is-primary'},
    )
|
|
@ -0,0 +1,66 @@
|
|||
import uuid
|
||||
|
||||
from django.core.validators import FileExtensionValidator
|
||||
from django.db import models
|
||||
import time
|
||||
|
||||
|
||||
# Create your models here.
|
||||
|
||||
class RetrieverTask(models.Model):
    """One keyword-retrieval job; uploaded files attach to it via ``attachment``."""

    # Unique identifier of the task, generated with uuid4 and not editable.
    task_uuid = models.UUIDField(default=uuid.uuid4, unique=True, editable=False)
    # Keywords of this task, stored as text (required).
    task_keywords = models.CharField(max_length=1024, blank=False, null=False)
    task_status = models.BooleanField(default=False)
    # Whether the task has already been started; prevents running it twice.
    task_started = models.BooleanField(default=False)
    # Set automatically when the row is created.
    task_created = models.DateTimeField(auto_now_add=True)

    def __str__(self):
        """Return the task UUID as a string."""
        return str(self.task_uuid)
|
||||
|
||||
|
||||
class UploadFile(models.Model):
    """A file uploaded for a ``RetrieverTask`` (only ``.docx`` is accepted)."""

    # Unique identifier of the file, generated with uuid4 and not editable.
    file_id = models.UUIDField(default=uuid.uuid4, unique=True, editable=False)
    # Owning task; reachable from the task as ``task.attachment``.
    related_task = models.ForeignKey('RetrieverTask', related_name='attachment', on_delete=models.CASCADE)
    file_name = models.CharField(max_length=100)
    is_checked = models.BooleanField(default=False)
    file = models.FileField(upload_to='uploads/', validators=[FileExtensionValidator(allowed_extensions=['docx'])])

    def __str__(self):
        """Return the stored file name."""
        return self.file_name

    @property
    def file_path(self):
        """Path of the stored file, as reported by the file field."""
        return self.file.path

    @property
    def file_keyword_str(self):
        """Return the distinct keywords found in this file, joined by commas.

        The ``keyword`` value of every related ``KeywordParagraph`` is split
        on commas and each piece is stripped of brackets, quotes and spaces
        (the values appear to be the text form of a Python list such as
        ``"['a', 'b']"``). Duplicates are removed while keeping the order in
        which keywords are first seen, instead of the arbitrary order a
        ``set`` produced before.
        """
        keyword_in_paragraph = self.keyword_paragraph.all().values_list('keyword', flat=True)
        flat_keyword = [
            item.strip("[]' ")
            for sublist in keyword_in_paragraph
            for item in sublist.split(',')
        ]
        # dict.fromkeys de-duplicates and preserves first-seen order.
        return ','.join(dict.fromkeys(flat_keyword))
|
||||
|
||||
|
||||
class KeywordParagraph(models.Model):
    """A paragraph of an uploaded file together with the keyword text for it."""

    # Owning file; reachable from the file as ``upload_file.keyword_paragraph``.
    related_file = models.ForeignKey(
        'UploadFile',
        related_name='keyword_paragraph',
        on_delete=models.CASCADE,
    )
    keyword = models.CharField(max_length=1024)
    paragraph = models.TextField()

    def __str__(self):
        """Return the stored keyword text."""
        return self.keyword
|
||||
|
||||
|
||||
class Keywords(models.Model):
    """A keyword that can be switched on or off through ``is_active``."""

    keyword = models.CharField(max_length=64)
    is_active = models.BooleanField(default=True)
    # Set automatically when the row is created.
    keyword_created = models.DateTimeField(auto_now_add=True)

    def __str__(self):
        """Return the keyword text."""
        return self.keyword

    @property
    def active_keyword_list(self):
        """Return the ``keyword`` values of all rows with ``is_active=True``.

        The result covers the whole table, not just this instance, and is a
        flat ``values_list`` queryset.
        """
        active_rows = Keywords.objects.filter(is_active=True)
        return active_rows.values_list('keyword', flat=True)
|
|
@ -0,0 +1,27 @@
|
|||
# app/tasks.py, 可以复用的task 改这个文件记得重启 Celery!!!
|
||||
import ast
|
||||
from celery import shared_task
|
||||
|
||||
from .models import RetrieverTask, UploadFile, KeywordParagraph
|
||||
|
||||
from .tools.keyword_find import util_keyword_find
|
||||
|
||||
|
||||
@shared_task
def start_retriever_job(task_id):
    """Scan the unchecked attachments of a task and store the matching paragraphs.

    :param task_id: ``task_uuid`` of the ``RetrieverTask`` to process.
    :return: ``task_id`` unchanged.
    :raises RetrieverTask.DoesNotExist: if no task has that UUID.

    For every attachment that is not yet checked, the file is searched with
    ``util_keyword_find``, one ``KeywordParagraph`` is created per matching
    paragraph, and the attachment is marked as checked. Finally the task's
    ``task_status`` is set to True.
    """
    # Local import keeps this block self-contained.
    from django.db import transaction

    current_task = RetrieverTask.objects.get(task_uuid=task_id)
    # task_keywords holds the text form of a list; turn it back into a list.
    task_keywords = ast.literal_eval(current_task.task_keywords)

    for each in current_task.attachment.filter(is_checked=False):
        result_dict = util_keyword_find(each.file_path, task_keywords)
        # .get() tolerates results that carry no 'paragraph_keyword' entry.
        found_paragraphs = result_dict.get('find_list', [])
        found_keywords = result_dict.get('paragraph_keyword', [])

        # Store the results and set the checked flag together, so a failure
        # cannot leave a file marked as checked without its paragraphs.
        with transaction.atomic():
            KeywordParagraph.objects.bulk_create([
                KeywordParagraph(related_file=each, keyword=para_keyword, paragraph=paragraph)
                for paragraph, para_keyword in zip(found_paragraphs, found_keywords)
            ])
            UploadFile.objects.filter(file_id=each.file_id).update(is_checked=True)

    RetrieverTask.objects.filter(task_uuid=task_id).update(task_status=True)
    return task_id
|
|
@ -0,0 +1,67 @@
|
|||
{% extends 'base.html' %}
|
||||
{% load unicorn %}
|
||||
|
||||
{% block content %}
|
||||
<div class="container">
|
||||
<div class="content">
|
||||
{# <h5 class="title is-6">查找关键词: <span class="tag is-danger is-light is-medium">{{ current_task.task_keywords }}</span></h5>#}
|
||||
<div>
|
||||
{% if not current_task.task_status %}
|
||||
<b>搜索状态: <span class="tag is-danger is-light is-medium">正在检索</span></b>
|
||||
<progress class="progress is-small is-primary" max="100">15%</progress>
|
||||
|
||||
{% endif %}
|
||||
{% if current_task.task_status %}
|
||||
<b>搜索状态: <span class="tag is-success is-light is-medium">已完成</span></b>
|
||||
{% endif %}
|
||||
<br>
|
||||
</div>
|
||||
{% for task_file in current_task.attachment.all %}
|
||||
|
||||
<div class="block">
|
||||
<span class="tag is-info is-medium">
|
||||
{{ task_file.file_name }}
|
||||
</span>
|
||||
<b>检索到的敏感词:{{ task_file.file_keyword_str }}</b>
|
||||
<br>
|
||||
{% for paragraph in task_file.keyword_paragraph.all %}
|
||||
<p>{{ forloop.counter }}.{{ paragraph.paragraph | safe }}</p>
|
||||
{% endfor %}
|
||||
</div>
|
||||
|
||||
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% endblock %}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
{#{% block content %}#}
|
||||
{# <div>#}
|
||||
{# {% unicorn 'paragraph_render' current_task.task_uuid %}#}
|
||||
{# </div>#}
|
||||
{##}
|
||||
{#{% endblock %}#}
|
||||
|
||||
|
||||
{% block script %}
|
||||
function monitorAndRefresh() {
    // Task status as rendered into the page by the template.
    const taskStatus = "{{ current_task.task_status }}";

    // Nothing to do once the status reads "True".
    if (taskStatus === "True") {
        return;
    }

    // Otherwise reload the page after 1000 ms (one second).
    setTimeout(() => {
        location.reload();
    }, 1000);
}

// Run the check once when the script is executed.
monitorAndRefresh();
|
||||
|
||||
{% endblock %}
|
|
@ -0,0 +1,6 @@
|
|||
<div>
|
||||
<!-- put component code here -->
|
||||
<div unicorn:poll-1000="get_paragraphs">{{ current_task.task_status }}</div>
|
||||
</div>
|
||||
|
||||
|
|
@ -0,0 +1,3 @@
|
|||
from django.test import TestCase
|
||||
|
||||
# Create your tests here.
|
|
@ -0,0 +1,99 @@
|
|||
# from docx import Document
|
||||
import docx2txt
|
||||
import re
|
||||
|
||||
|
||||
def insert_html_tag(paragraph: str, keyword_list: list) -> str:
    """
    Highlight keywords in a paragraph by wrapping them in HTML ``<span>`` tags.

    The paragraph text is HTML-escaped, because the result is meant to be
    rendered as markup and the text comes from uploaded documents. Keywords
    are matched in a single pass over the original text, so markup that was
    already inserted can never be matched again (a keyword such as ``span``
    used to corrupt earlier highlights). Empty keywords are ignored and, where
    keywords overlap, the longest one wins.

    :param paragraph: paragraph to process
    :param keyword_list: keywords to highlight
    :return: escaped paragraph with every keyword occurrence highlighted
    """
    import html  # local import keeps this block self-contained

    open_tag = '<span style="background-color:yellow">'
    close_tag = '</span>'

    # Drop empty keywords and duplicates; try longer keywords first.
    keywords = sorted(dict.fromkeys(k for k in keyword_list if k), key=len, reverse=True)
    if not keywords:
        return html.escape(paragraph)

    pattern = re.compile('|'.join(re.escape(keyword) for keyword in keywords))

    pieces = []
    position = 0
    for match in pattern.finditer(paragraph):
        pieces.append(html.escape(paragraph[position:match.start()]))
        pieces.append(open_tag + html.escape(match.group(0)) + close_tag)
        position = match.end()
    pieces.append(html.escape(paragraph[position:]))
    return ''.join(pieces)
|
||||
|
||||
|
||||
def docx_find(file_path: str, keyword_list: list) -> dict:
    """
    Find the paragraphs of a docx file that contain any of the keywords.

    The text is extracted with ``docx2txt`` and split on newlines; each
    resulting line is treated as one paragraph. Empty keywords are ignored,
    since an empty string is contained in every paragraph.

    :param file_path: path of the docx file
    :param keyword_list: keywords to look for
    :return: dictionary with the file path, the matching paragraphs and, in
        the same order, the keywords found in each of them:
        {'file_name': 'xxx.docx', 'find_list': ['paragraph 1', ...],
        'paragraph_keyword': [['keyword', ...], ...]}
    """
    doc_text = docx2txt.process(file_path)
    keywords = [keyword for keyword in keyword_list if keyword]

    find_list = []
    paragraph_keyword = []
    for para in doc_text.split('\n'):
        this_para_keyword = [keyword for keyword in keywords if keyword in para]
        if this_para_keyword:
            # Both lists are appended together so their indexes line up.
            paragraph_keyword.append(this_para_keyword)
            find_list.append(para)

    return {
        'file_name': file_path,
        'find_list': find_list,
        'paragraph_keyword': paragraph_keyword,
    }
|
||||
|
||||
|
||||
# print(docx_find('浅水海底电缆打捞大作业.docx', ['机密', '铲出']))
|
||||
def util_keyword_find(file_path: str, keyword_list: list) -> dict:
    """
    Search a single file for keywords and highlight them in the matches.

    Only ``.docx`` files are searched (the extension check ignores case).
    For any other file an empty result is returned that still carries all
    three keys, so callers can read ``'paragraph_keyword'`` unconditionally.

    :param file_path: path of the file
    :param keyword_list: keywords to look for
    :return: {'file_name': ..., 'find_list': [...], 'paragraph_keyword': [...]}
        where ``find_list`` holds the matching paragraphs with the keywords
        wrapped in highlight markup
    """
    if not file_path.lower().endswith('.docx'):
        return {'file_name': file_path, 'find_list': [], 'paragraph_keyword': []}

    find_dict = docx_find(file_path, keyword_list)
    # Rebuild the list in one pass; looking entries up with list.index()
    # while replacing them was quadratic and relied on entries being unique.
    find_dict['find_list'] = [
        insert_html_tag(para, keyword_list) for para in find_dict['find_list']
    ]
    return find_dict
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
print(util_keyword_find('浅水海底电缆打捞大作业.docx', ['打捞', '浅水']))
|
|
@ -0,0 +1,7 @@
|
|||
from django.urls import path, include
|
||||
from . import views
|
||||
|
||||
# Routes of the retriever app:
#   ''                  -> KeywordRetrieverView (name 'index')
#   '<uuid:task_uuid>'  -> TaskViewerView (name 'task_viewer')
urlpatterns = [
    path('', views.KeywordRetrieverView.as_view(), name='index'),
    path('<uuid:task_uuid>', views.TaskViewerView.as_view(), name='task_viewer'),
]
|
|
@ -0,0 +1,47 @@
|
|||
from django.shortcuts import render
|
||||
from django.urls import reverse_lazy
|
||||
from django.views import View
|
||||
from django.views.generic import FormView
|
||||
|
||||
from retriever.forms import UploadForm
|
||||
from retriever.models import RetrieverTask, UploadFile, KeywordParagraph, Keywords
|
||||
from .tasks import start_retriever_job
|
||||
|
||||
# Create your views here.
|
||||
|
||||
|
||||
class KeywordRetrieverView(FormView):
    """Upload page that creates a task and stores the uploaded files."""

    template_name = "index.html"
    form_class = UploadForm

    def form_valid(self, form):
        """Create a ``RetrieverTask`` and one ``UploadFile`` per attachment.

        The task receives the list of all currently active keywords. Its UUID
        is kept on ``self.task_uuid`` for ``get_success_url``.
        """
        active_keywords = Keywords.objects.filter(is_active=True).values_list('keyword', flat=True)
        r_task = RetrieverTask.objects.create(task_keywords=list(active_keywords))
        self.task_uuid = r_task.task_uuid

        # Save every uploaded file and link it to the new task.
        for uploaded in form.cleaned_data['attachments']:
            UploadFile.objects.create(related_task=r_task, file_name=uploaded.name, file=uploaded)

        return super().form_valid(form)

    def get_success_url(self):
        """Redirect to the viewer page of the task created in ``form_valid``."""
        url_kwargs = {'task_uuid': self.task_uuid}
        return reverse_lazy('task_viewer', kwargs=url_kwargs)
|
||||
|
||||
|
||||
class TaskViewerView(View):
    """Result page of one task; starts the background job on the first visit."""

    def get(self, requests, task_uuid):
        """Render ``task_viewer.html`` for the task with the given UUID.

        :param requests: the incoming HTTP request
        :param task_uuid: UUID captured from the URL
        :return: rendered response with ``current_task`` in the context
        :raises Http404: if no task has that UUID (previously an unhandled
            ``DoesNotExist``)
        """
        # Local import keeps this block self-contained.
        from django.shortcuts import get_object_or_404

        current_task = get_object_or_404(RetrieverTask, task_uuid=task_uuid)

        if not current_task.task_started:
            # Flip the flag with one conditional UPDATE. Only the request
            # whose update changes the row dispatches the job, so concurrent
            # requests (the page reloads itself while the task is running)
            # cannot start it twice.
            claimed = RetrieverTask.objects.filter(
                task_uuid=task_uuid, task_started=False,
            ).update(task_started=True)
            current_task.task_started = True
            if claimed:
                start_retriever_job.delay(current_task.task_uuid)

        return render(requests, 'task_viewer.html', {'current_task': current_task})
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,100 @@
|
|||
{% load static %}
|
||||
{% load unicorn %}
|
||||
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta http-equiv="X-UA-Compatible" content="IE=edge">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0, shrink-to-fit=no">
|
||||
<meta name="author" content="Aldi Duzha">
|
||||
<meta name="description" content="Search houses and apartments for rent anywhere within the US. View floorplans, pricing, images and more. Find your perfect rental.">
|
||||
<meta name="keywords" content="bulma, rent, template, apartments, page, website, free, awesome">
|
||||
<title>寻章智搜</title>
|
||||
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free@5.13.0/css/all.min.css">
|
||||
<link rel="stylesheet" type="text/css" href="{% static 'bulma.min.css' %}">
|
||||
{% unicorn_scripts %}
|
||||
</head>
|
||||
<body>
|
||||
{% csrf_token %}
|
||||
<div id="app">
|
||||
<section class="hero is-fullheight is-light" >
|
||||
<div class="hero-head">
|
||||
<nav class="navbar is-transparent is-spaced" role="navigation" aria-label="main navigation">
|
||||
<div class="container">
|
||||
<div class="navbar-brand">
|
||||
<a class="navbar-item" href="/">
|
||||
{# <img src="https://bulma.io/images/bulma-logo.png" alt="Bulma Rent" width="80" height="20">#}
|
||||
<span class="is-size-3 has-text-weight-semibold">寻章智搜</span>
|
||||
</a>
|
||||
<a role="button" class="navbar-burger burger" aria-label="menu" aria-expanded="false" data-target="navbarTopMain">
|
||||
<span aria-hidden="true"></span>
|
||||
<span aria-hidden="true"></span>
|
||||
<span aria-hidden="true"></span>
|
||||
</a>
|
||||
</div>
|
||||
<div class="navbar-menu" id="navbarTopMain">
|
||||
<div class="navbar-end">
|
||||
|
||||
<a href="#" class="navbar-item has-text-weight-semibold">Post a listing</a>
|
||||
<div class="navbar-item">
|
||||
<a href="https://aldi.github.io/awesome-bulma-templates/templates/login/login.html" class="button is-primary">
|
||||
登录
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</nav>
|
||||
</div>
|
||||
{% block content %}
|
||||
{% endblock %}
|
||||
|
||||
|
||||
</section>
|
||||
|
||||
<footer class="hero is-small is-light">
|
||||
<div class="hero-body">
|
||||
<div class="container has-text-centered">
|
||||
<a href="https://raiot.me">
|
||||
Made with <span class="has-text-danger">❤</span> by Raiot
|
||||
</a>
|
||||
<div class="columns m-t-10">
|
||||
<div class="column">
|
||||
<nav class="has-text-grey-light">
|
||||
<a href="#" class="has-text-primary">About</a> •
|
||||
</nav>
|
||||
</div>
|
||||
</div>
|
||||
<div class="b-t m-t-30 p-t-30 has-text-grey-light is-size-7">
|
||||
Rent Template 2020 © Aldi Duzha <br> <a href="#" class="has-text-primary">Terms of use</a> and <a class="has-text-primary" href="#">Privacy policy</a>.
|
||||
<a href="#" class="has-text-primary">Fair Housing</a>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</footer>
|
||||
</div>
|
||||
<script>
|
||||
document.addEventListener('DOMContentLoaded', () => {
|
||||
// Get all "navbar-burger" elements
|
||||
const $navbarBurgers = Array.prototype.slice.call(document.querySelectorAll('.navbar-burger'), 0);
|
||||
// Check if there are any navbar burgers
|
||||
if ($navbarBurgers.length > 0) {
|
||||
// Add a click event on each of them
|
||||
$navbarBurgers.forEach( el => {
|
||||
el.addEventListener('click', () => {
|
||||
// Get the target from the "data-target" attribute
|
||||
const target = el.dataset.target;
|
||||
const $target = document.getElementById(target);
|
||||
// Toggle the "is-active" class on both the "navbar-burger" and the "navbar-menu"
|
||||
el.classList.toggle('is-active');
|
||||
$target.classList.toggle('is-active');
|
||||
});
|
||||
});
|
||||
}
|
||||
});
|
||||
{% block script %}
|
||||
{% endblock %}
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
|
@ -0,0 +1,52 @@
|
|||
{% extends 'base.html' %}
|
||||
{% load static %}
|
||||
{% load unicorn %}
|
||||
|
||||
|
||||
|
||||
{% block content %}
|
||||
<div class="hero-body p-b-30 ">
|
||||
<div class="container">
|
||||
<h2 class="subtitle">
|
||||
<span class="has-text-centered is-block">
|
||||
Search hundreds of thousands of apartments, condos and houses for rent.
|
||||
</span>
|
||||
</h2>
|
||||
<h1 class="title">
|
||||
<span class="is-size-2 has-text-centered is-block">文件敏感词检测工具</span>
|
||||
</h1>
|
||||
<div class="container">
|
||||
<div class="notification">
|
||||
<span class="has-text-centered is-block">批量上传</span>
|
||||
<form method="POST" enctype="multipart/form-data">
|
||||
{% csrf_token %}
|
||||
<div class="columns">
|
||||
<div class="column has-text-centered">
|
||||
<div class="file is-boxed">
|
||||
<label class="file-label">
|
||||
{{ form.attachments }}
|
||||
<span class="file-cta">
|
||||
<span class="file-icon">
|
||||
<i class="fas fa-upload"></i>
|
||||
</span>
|
||||
<span class="file-label">
|
||||
选择多个文件
|
||||
</span>
|
||||
</span>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<br>
|
||||
<input type="submit" value="提交" class="button is-link is-medium">
|
||||
</form>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
{% endblock %}
|
Loading…
Reference in New Issue