Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

469 Дополнение модели данных для хранения распарсенного текста #496

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion app/db/db_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
# Handles to the MongoDB collections used by the helpers in this module.
checks_collection = db['checks']
consumers_collection = db['consumers']
criteria_pack_collection = db['criteria_pack']
parsed_texts_collection = db['parsed_texts']  # documents written by add_parsed_text
# 'logs' must be a capped collection (5242880 bytes = 5 MiB). Indexing the
# database always returns a collection handle, even when the collection does
# not exist yet, so the handle's truth value cannot be used as an existence
# test; ask the server for the list of collection names instead.
logs_collection = (
    db['logs']
    if 'logs' in db.list_collection_names()
    else db.create_collection('logs', capped=True, size=5242880)
)
celery_check_collection = db['celery_check']  # collection for mapping celery_task to check
Expand Down Expand Up @@ -144,6 +145,12 @@ def update_check(check):
return bool(checks_collection.find_one_and_replace({'_id': check._id}, check.pack()))


def add_parsed_text(check_id, parsed_text):
    """Insert a parsed text and register its id under ``check_id``.

    The packed form of *parsed_text* is inserted into
    ``parsed_texts_collection``. The id of the new document is then pushed
    onto the ``parsed_texts`` array of the ``files_info_collection`` document
    whose ``_id`` equals *check_id*.

    Returns the id of the inserted parsed-text document.
    """
    insert_result = parsed_texts_collection.insert_one(parsed_text.pack())
    parsed_text_id = insert_result.inserted_id
    files_info_collection.update_one(
        {'_id': check_id},
        {"$push": {'parsed_texts': parsed_text_id}},
    )
    return parsed_text_id


def write_pdf(filename, filepath):
    """Convert the file at *filepath* to PDF and hand it to ``add_file_to_db``.

    The conversion is done by ``convert_to`` with ``target_format='pdf'``.
    The converted file's path is stored under *filename*, and the value
    returned by ``add_file_to_db`` is passed back to the caller.
    """
    return add_file_to_db(filename, convert_to(filepath, target_format='pdf'))
Expand Down Expand Up @@ -228,7 +235,7 @@ def set_passbacked_flag(checks_id, flag):
def get_latest_users_check(filter=None):
local_filter = filter
user = local_filter.get('user')
username_filter = {'username': user} if user else {}
username_filter = {'username': user} if user else {}
all_users = [user['username'] for user in users_collection.find(username_filter, {'username': 1})]
latest_checks = []
for user in all_users:
Expand Down
10 changes: 10 additions & 0 deletions app/db/db_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from main.check_packs import BASE_PACKS, BaseCriterionPack, DEFAULT_TYPE_INFO, DEFAULT_REPORT_TYPE_INFO


class Packable:
def __init__(self, dictionary):
pass
Expand Down Expand Up @@ -104,6 +105,7 @@ def __init__(self, dictionary=None):
self.is_failed = dictionary.get('is_failed', None)
self.is_ended = dictionary.get('is_ended', True)
self.is_passed = dictionary.get('is_passed', int(self.score) == 1)
self.parsed_chapters = dictionary.get('parsed_chapters', [])

def calc_score(self):
# check after implementation criterion pack
Expand Down Expand Up @@ -145,3 +147,11 @@ def none_to_false(x):
is_ended = none_to_true(self.is_ended) # None for old checks => True, True->True, False->False
is_failed = none_to_false(self.is_failed) # None for old checks => False, True->True, False->False
return {'is_ended': is_ended, 'is_failed': is_failed}


class ParsedText(PackableWithId):
    """Parsed text of a file, kept as a list of chapters.

    Built from an optional dictionary. Missing keys fall back to defaults,
    so ``ParsedText()`` and ``ParsedText({})`` both yield an empty object.
    """

    def __init__(self, dictionary=None):
        super().__init__(dictionary)
        dictionary = dictionary or {}
        # Name of the file the text was parsed from ('' when not provided).
        self.filename = dictionary.get('filename', '')
        # Read chapters from the dictionary (as Check does for the same key)
        # so that an object rebuilt from a stored document keeps them.
        # Copy the list so the instance never aliases the caller's data.
        self.parsed_chapters = list(dictionary.get('parsed_chapters') or [])
28 changes: 28 additions & 0 deletions app/main/reports/parse_file/parse_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import re


def parse_headers_and_pages(chapters, docx):
    """Fill in ``start_page`` for each chapter by searching the PDF page texts.

    Args:
        chapters: list of dicts with at least a ``"header"`` key; each dict
            is updated in place with a ``"start_page"`` value.
        docx: object whose ``pdf_file.get_text_on_page()`` returns a mapping
            of page key to page text.

    Returns:
        The same ``chapters`` list that was passed in.

    Before matching, each page text is normalized: a hyphen followed by a
    newline is removed (joining hyphenated words), and a whitespace character
    followed by a newline is collapsed to a single space. Pages containing
    "СОДЕРЖАНИЕ" are skipped (presumably the table of contents, which lists
    every header).
    """
    hyphen_break = re.compile(r"(-\n)")
    space_break = re.compile(r"\s\n")

    for page, text in docx.pdf_file.get_text_on_page().items():
        text = hyphen_break.sub("", text)
        text = space_break.sub(" ", text)
        if "СОДЕРЖАНИЕ" in text:
            continue
        for chapter in chapters:
            header = chapter["header"]
            # An empty header is a substring of every page, so it can never
            # be located meaningfully; leave its start_page untouched.
            if not header:
                continue
            # NOTE(review): a later page containing the same header text
            # overwrites an earlier match, so the last matching page wins.
            # Confirm this is intended rather than "first match".
            if header in text:
                chapter["start_page"] = page
    return chapters


def parse_chapters(docx):
    """Collect heading chapters of a parsed document together with their text.

    Args:
        docx: object with a ``chapters`` iterable of dicts. Each dict has
            ``"styled_text"`` (a dict with a ``"text"`` string), ``"style"``
            and ``"child"`` (a list of dicts, each with its own
            ``"styled_text"``).

    Returns:
        A list of ``{"header", "start_page", "text"}`` dicts, one per chapter
        that has at least one child and whose style contains ``"heading"``.
        ``start_page`` is always 0 here. ``text`` is the children's texts
        concatenated without a separator.
    """
    chapters = []
    for chapter in docx.chapters:
        head = chapter["styled_text"]["text"]
        # For "ПРИЛОЖЕНИЕ" headers keep only the part before the first dot.
        if "ПРИЛОЖЕНИЕ" in head:
            head = head.split(".")[0]
        # Only headings that actually have content become chapters.
        if not chapter["child"] or "heading" not in chapter["style"]:
            continue
        text = "".join(child["styled_text"]["text"] for child in chapter["child"])
        chapters.append({"header": head, "start_page": 0, "text": text})
    return chapters
5 changes: 3 additions & 2 deletions app/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from os.path import join
from sys import argv
from io import StringIO

import bson
import pandas as pd
from bson import ObjectId
Expand Down Expand Up @@ -213,6 +212,7 @@ def run_task():
converted_id = db_methods.add_file_to_db(filenamepdf, filepathpdf)
else:
converted_id = db_methods.write_pdf(filename, filepath)

check = Check({
'_id': file_id,
'conv_pdf_fs_id': converted_id,
Expand All @@ -225,7 +225,8 @@ def run_task():
'score': -1, # score=-1 -> checking in progress
'is_ended': False,
'is_failed': False,
'params_for_passback': current_user.params_for_passback
'params_for_passback': current_user.params_for_passback,
'parsed_chapters': []
})
db_methods.add_check(file_id, check) # add check for parsed_file to db
task = create_task.delay(check.pack(to_str=True)) # add check to queue
Expand Down
16 changes: 14 additions & 2 deletions app/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,9 @@
from celery import Celery

import passback_grades
from app.main.reports.parse_file.parse_file import parse_headers_and_pages, parse_chapters
from db import db_methods
from db.db_types import Check
from db.db_types import Check, ParsedText
from main.checker import check
from main.parser import parse
from main.check_packs import BASE_PACKS
Expand Down Expand Up @@ -41,10 +42,21 @@ def create_task(self, check_info):
original_filepath = join(FILES_FOLDER, f"{check_id}.{check_obj.filename.rsplit('.', 1)[-1]}")
pdf_filepath = join(FILES_FOLDER, f"{check_id}.pdf")
try:
updated_check = check(parse(original_filepath, pdf_filepath), check_obj)
parsed_file_object = parse(original_filepath, pdf_filepath)
parsed_file_object.make_chapters(check_obj.file_type['report_type'])
parsed_file_object.make_headers(check_obj.file_type['report_type'])
chapters = parse_chapters(parsed_file_object)

updated_check = check(parsed_file_object, check_obj)
updated_check.is_ended = True
updated_check.is_failed = False
updated_check.parsed_chapters = parse_headers_and_pages(chapters, parsed_file_object)

parsed_text = ParsedText(check_info)
parsed_text.parsed_chapters = parse_headers_and_pages(chapters, parsed_file_object)

db_methods.update_check(updated_check) # save to db
db_methods.add_parsed_text(check_id, parsed_text)
db_methods.mark_celery_task_as_finished(self.request.id)

# remove files from FILES_FOLDER after checking
Expand Down