Source code for dedoc.dedoc_manager

import logging
import os.path
import shutil
import tempfile
from typing import Dict, Optional

from dedoc.common.exceptions.dedoc_error import DedocError
from dedoc.config import get_config
from dedoc.data_structures import ParsedDocument, UnstructuredDocument
from dedoc.manager_config import get_manager_config
from dedoc.metadata_extractors import BaseMetadataExtractor
from dedoc.train_dataset.train_dataset_utils import get_path_original_documents, save_line_with_meta
from dedoc.utils.utils import get_unique_name


class DedocManager:
    """
    This class allows to run the whole pipeline of the document processing:

    1. Converting
    2. Reading
    3. Metadata extraction
    4. Structure extraction
    5. Output structure construction
    6. Attachments handling
    """

    def __init__(self, config: Optional[dict] = None, manager_config: Optional[dict] = None) -> None:
        """
        :param config: config for document processing, the result of `get_config()` is used if it is None
        :param manager_config: dictionary with different stage document processors,
            the result of `get_manager_config(config)` is used if it is None.

        The following keys should be in the `manager_config` dictionary:
            - converter (optional) (:class:`~dedoc.converters.FileConverterComposition`)
            - reader (:class:`~dedoc.readers.ReaderComposition`)
            - structure_extractor (:class:`~dedoc.structure_extractors.StructureExtractorComposition`)
            - structure_constructor (:class:`~dedoc.structure_constructors.StructureConstructorComposition`)
            - document_metadata_extractor (:class:`~dedoc.metadata_extractors.MetadataExtractorComposition`)
            - attachments_handler (:class:`~dedoc.attachments_handler.AttachmentsHandler`)

        :raises AssertionError: if any of the non-optional processors is missing (or None) in `manager_config`
        """
        self.config = get_config() if config is None else config
        self.logger = self.config.get("logger", logging.getLogger())
        manager_config = get_manager_config(self.config) if manager_config is None else manager_config

        # the converter is the only optional stage: without it files are passed to the reader unchanged
        self.converter = manager_config.get("converter", None)

        self.reader = self._get_required_processor(manager_config, "reader", "Reader shouldn't be None")
        self.structure_extractor = self._get_required_processor(manager_config, "structure_extractor", "Structure extractor shouldn't be None")
        self.structure_constructor = self._get_required_processor(manager_config, "structure_constructor", "Structure constructor shouldn't be None")
        self.document_metadata_extractor = self._get_required_processor(
            manager_config, "document_metadata_extractor", "Document metadata extractor shouldn't be None"
        )
        self.attachments_handler = self._get_required_processor(manager_config, "attachments_handler", "Attachments handler shouldn't be None")

    @staticmethod
    def _get_required_processor(manager_config: dict, key: str, error_message: str) -> object:
        """
        Get a mandatory stage processor from the manager config.

        :param manager_config: dictionary with stage document processors
        :param key: name of the processor in the `manager_config`
        :param error_message: message of the error raised when the processor is absent
        :return: the processor stored under the `key`
        :raises AssertionError: if there is no such key in the `manager_config` or its value is None
        """
        processor = manager_config.get(key, None)
        if processor is None:
            # Raised explicitly instead of using the `assert` statement: asserts are removed under `python -O`,
            # the exception type is kept so that existing callers observe the same error.
            raise AssertionError(error_message)
        return processor

    def parse(self, file_path: str, parameters: Optional[Dict[str, str]] = None) -> ParsedDocument:
        """
        Run the whole pipeline of the document processing.
        If some error occurred, file metadata are stored in the exception's metadata field.

        :param file_path: full path where the file is located
        :param parameters: any parameters, specify how to parse file (see API parameters documentation for more details)
        :return: parsed document
        :raises DedocError: re-raised with the `filename` and `metadata` fields filled
        :raises FileNotFoundError: if `file_path` doesn't point to an existing file
        """
        parameters = {} if parameters is None else parameters

        try:
            return self.__parse_no_error_handling(file_path=file_path, parameters=parameters)
        except DedocError as e:
            file_dir, file_name = os.path.split(file_path)
            e.filename = file_name
            e.metadata = BaseMetadataExtractor._get_base_meta_information(directory=file_dir, filename=file_name, name_actual=file_name)
            # bare `raise` re-raises the same exception object with its original traceback
            raise

    def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str]) -> ParsedDocument:
        """
        Function of complete document parsing without errors handling.

        :param file_path: full path where the file is located
        :param parameters: any parameters, specify how to parse file
        :return: parsed document
        :raises FileNotFoundError: if `file_path` doesn't point to an existing file
        """
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"File {file_path} does not exist or is not a file")

        self.logger.info(f"Start handle {file_path}")
        file_name = os.path.basename(file_path)
        unique_filename = get_unique_name(file_name)

        # all intermediate files live in a temporary directory that is removed when the block is left
        with tempfile.TemporaryDirectory() as tmp_dir:
            shutil.copy(file_path, os.path.join(tmp_dir, unique_filename))

            # Step 1 - Converting
            if self.converter is None:
                # the converter is optional: without it the copied file is read as is
                converted_filename = unique_filename
            else:
                converted_filename = self.converter.do_converting(tmp_dir, unique_filename, parameters=parameters)
            self.logger.info(f"Finish conversion {file_name} -> {converted_filename}")

            # Step 2 - Reading content
            unstructured_document = self.reader.parse_file(tmp_dir=tmp_dir, filename=converted_filename, parameters=parameters)
            self.logger.info(f"Finish parse file {file_name}")

            # Step 3 - Adding meta-information
            unstructured_document = self.document_metadata_extractor.add_metadata(
                document=unstructured_document,
                directory=tmp_dir,
                filename=unique_filename,
                converted_filename=converted_filename,
                original_filename=file_name,
                parameters=parameters,
                other_fields=unstructured_document.metadata,
            )
            self.logger.info(f"Add metadata of file {file_name}")

            # Step 4 - Extract structure
            unstructured_document = self.structure_extractor.extract_structure(unstructured_document, parameters)
            self.logger.info(f"Extract structure from file {file_name}")

            # in the labeling mode the lines and the copied file are saved before the temporary directory disappears
            if self.config.get("labeling_mode", False):
                self.__save(os.path.join(tmp_dir, unique_filename), unstructured_document)

            # Step 5 - Form the output structure
            parsed_document = self.structure_constructor.structure_document(
                document=unstructured_document,
                structure_type=parameters.get("structure_type"),
                parameters=parameters,
            )
            self.logger.info(f"Get structured document {file_name}")

            # Step 6 - Get attachments
            # the manager passes itself as the document parser to the attachments handler
            attachments = self.attachments_handler.handle_attachments(document_parser=self, document=unstructured_document, parameters=parameters)
            parsed_document.add_attachments(attachments)
            self.logger.info(f"Get attachments {file_name}")

        self.logger.info(f"Finish handle {file_name}")
        return parsed_document

    def __save(self, file_path: str, classified_document: UnstructuredDocument) -> None:
        """
        Pass the document lines to `save_line_with_meta` and copy the file
        into the directory returned by `get_path_original_documents` (used in the labeling mode).

        :param file_path: path to the file that should be copied
        :param classified_document: document whose lines are saved
        """
        base_name = os.path.basename(file_path)
        save_line_with_meta(lines=classified_document.lines, config=self.config, original_document=base_name)
        shutil.copy(file_path, os.path.join(get_path_original_documents(self.config), base_name))