# Source code for dedoc.dedoc_manager

from typing import Dict, Optional, Tuple

from dedoc.api.api_args import QueryParameters
from dedoc.common.exceptions.bad_file_error import BadFileFormatError
from dedoc.common.exceptions.conversion_error import ConversionError
from dedoc.common.exceptions.dedoc_error import DedocError
from dedoc.data_structures.parsed_document import ParsedDocument
from dedoc.data_structures.unstructured_document import UnstructuredDocument


class DedocManager:
    """
    This class allows to run the whole pipeline of the document processing:

    1. Converting
    2. Reading
    3. Metadata extraction
    4. Structure extraction
    5. Output structure construction
    6. Attachments handling
    """

    def __init__(self, config: Optional[dict] = None, manager_config: Optional[dict] = None) -> None:
        """
        :param config: config for document processing, `get_config()` is used if it is None
        :param manager_config: dictionary with different stage document processors, `get_manager_config(config)` is used if it is None.

        The following keys should be in the `manager_config` dictionary:
            - converter (optional) (:class:`~dedoc.converters.ConverterComposition`)
            - reader (:class:`~dedoc.readers.ReaderComposition`)
            - structure_extractor (:class:`~dedoc.structure_extractors.StructureExtractorComposition`)
            - structure_constructor (:class:`~dedoc.structure_constructors.StructureConstructorComposition`)
            - document_metadata_extractor (:class:`~dedoc.metadata_extractors.MetadataExtractorComposition`)
            - attachments_handler (:class:`~dedoc.attachments_handler.AttachmentsHandler`)

        :raises AssertionError: if any of the non-optional keys is missing from `manager_config` or is None
        """
        import logging

        # project modules are imported only when their defaults are actually requested
        if config is None:
            from dedoc.config import get_config
            config = get_config()
        self.config = config
        self.logger = self.config.get("logger", logging.getLogger())

        if manager_config is None:
            from dedoc.manager_config import get_manager_config
            manager_config = get_manager_config(self.config)

        # the converter is the only optional stage: when it is None, files are passed to the reader as is
        self.converter = manager_config.get("converter", None)
        self.reader = self.__get_required(manager_config, "reader", "Reader")
        self.structure_extractor = self.__get_required(manager_config, "structure_extractor", "Structure extractor")
        self.structure_constructor = self.__get_required(manager_config, "structure_constructor", "Structure constructor")
        self.document_metadata_extractor = self.__get_required(manager_config, "document_metadata_extractor", "Document metadata extractor")
        self.attachments_handler = self.__get_required(manager_config, "attachments_handler", "Attachments handler")

        self.default_parameters = QueryParameters().to_dict()

    @staticmethod
    def __get_required(manager_config: dict, key: str, title: str) -> object:
        """
        Get a mandatory stage processor from the manager config.

        :param manager_config: dictionary with stage processors
        :param key: key of the processor in `manager_config`
        :param title: human-readable processor name used in the error message
        :return: the processor stored under `key`
        :raises AssertionError: if the processor is missing or None
        """
        component = manager_config.get(key, None)
        if component is None:
            # explicit raise instead of an `assert` statement: the check must survive `python -O`,
            # the exception type and message are kept the same for backward compatibility
            raise AssertionError(f"{title} shouldn't be None")
        return component

    def parse(self, file_path: str, parameters: Optional[Dict[str, str]] = None) -> "ParsedDocument":
        """
        Run the whole pipeline of the document processing.
        If some error occurred, file metadata are stored in the exception's metadata field.

        :param file_path: full path where the file is located
        :param parameters: any parameters, specify how to parse file, see :ref:`parameters_description` for more details
        :return: parsed document
        :raises DedocError: re-raised with the `filename` and `metadata` fields filled
        """
        import os.path

        parameters = self.__init_parameters(file_path, parameters)
        self.logger.info(f"Get file {os.path.basename(file_path)} with parameters {parameters}")

        try:
            return self.__parse_no_error_handling(file_path=file_path, parameters=parameters)
        except DedocError as e:
            from dedoc.metadata_extractors.concrete_metadata_extractors.base_metadata_extractor import BaseMetadataExtractor

            file_dir, file_name = os.path.split(file_path)
            e.filename = file_name
            e.metadata = BaseMetadataExtractor._get_base_meta_information(directory=file_dir, filename=file_name, name_actual=file_name)
            # bare raise re-raises the same (now enriched) exception object
            raise

    def __parse_no_error_handling(self, file_path: str, parameters: Dict[str, str]) -> "ParsedDocument":
        """
        Function of complete document parsing without errors handling.

        :param file_path: full path where the file is located
        :param parameters: any parameters, specify how to parse file
        :return: parsed document
        :raises FileNotFoundError: if `file_path` is not an existing regular file
        """
        import os.path
        import shutil
        import tempfile
        from dedoc.utils.utils import get_unique_name

        if not os.path.isfile(file_path):
            raise FileNotFoundError(file_path)

        self.logger.info(f"Start handle {file_path}")
        file_name = os.path.basename(file_path)
        unique_filename = get_unique_name(file_name)

        # all the work is done on a copy of the file inside a temporary directory,
        # because the copy may be renamed during mime auto-detection
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_file_path = os.path.join(tmp_dir, unique_filename)
            shutil.copy(file_path, tmp_file_path)

            # Steps 1-3 - Converting, Reading content and Adding meta-information
            converted_file_path, unstructured_document = self.__read_with_mime_auto_detection(
                file_name=file_name, parameters=parameters, file_path=tmp_file_path
            )
            self.logger.info(f"Extract content from file {file_name}")

            # Step 4 - Extract structure
            unstructured_document = self.structure_extractor.extract(unstructured_document, parameters)
            self.logger.info(f"Extract structure from file {file_name}")

            if self.config.get("labeling_mode", False):
                self.__save(converted_file_path, unstructured_document)

            # Step 5 - Form the output structure
            parsed_document = self.structure_constructor.construct(document=unstructured_document, parameters=parameters)
            self.logger.info(f"Get structured document {file_name}")

            # Step 6 - Get attachments
            attachments = self.attachments_handler.handle_attachments(document_parser=self, document=unstructured_document, parameters=parameters)
            parsed_document.add_attachments(attachments)
            self.logger.info(f"Get attachments {file_name}")

        self.logger.info(f"Finish handle {file_name}")
        return parsed_document

    def __init_parameters(self, file_path: str, parameters: Optional[dict]) -> dict:
        """
        Fill the missing parameters with the default values.
        Only the keys of `self.default_parameters` (plus "attachments_dir") get into the result.

        :param file_path: path to the file, its directory is the default value of "attachments_dir"
        :param parameters: parameters given by the user, may be None
        :return: new dictionary with parameters, the given dictionary isn't changed
        """
        import os.path

        parameters = {} if parameters is None else parameters
        result_parameters = {name: parameters.get(name, default_value) for name, default_value in self.default_parameters.items()}

        attachments_dir = parameters.get("attachments_dir", None)
        result_parameters["attachments_dir"] = os.path.dirname(file_path) if attachments_dir is None else attachments_dir
        return result_parameters

    def __read_with_mime_auto_detection(self, file_path: str, file_name: str, parameters: Optional[dict]) -> "Tuple[str, UnstructuredDocument]":
        """
        Read the file using its extension, in case of failure retry using the mime detected by the file's content.
        On retry the file is renamed (the detected extension is appended to its path) and a warning is added to the document.

        :param file_path: path to the file to read (it may be renamed by this method)
        :param file_name: original name of the file
        :param parameters: parameters of the document processing
        :return: path to the converted file and the document with content and metadata
        """
        import os.path
        from dedoc.extensions import mime2extension
        from dedoc.utils.utils import get_file_mime_by_content, get_mime_extension

        # firstly, try to read file using its original extension
        mime, extension = get_mime_extension(file_path=file_path)
        try:
            converted_file_path, document = self.__parse_file(file_path=file_path, file_name=file_name, parameters=parameters, mime=mime, extension=extension)
        except (ConversionError, BadFileFormatError) as e:
            # secondly, try to read file using mime obtained by file's content
            detected_mime = get_file_mime_by_content(file_path)
            detected_extension = mime2extension.get(detected_mime, "")
            self.logger.warning(f'Could not read file {file_name} with mime = "{mime}", extension = "{extension}" ({e}). '
                                f'Detected file mime = "{detected_mime}", extension = "{detected_extension}"')

            fixed_file_path = f"{file_path}{detected_extension}"
            os.rename(file_path, fixed_file_path)
            converted_file_path, document = self.__parse_file(
                file_path=fixed_file_path, file_name=file_name, parameters=parameters, mime=detected_mime, extension=detected_extension
            )
            document.warnings.append(f'Incorrect extension "{extension}". Detected mime = "{detected_mime}", extension = "{detected_extension}"')

        return converted_file_path, document

    def __parse_file(self, file_path: str, file_name: str, parameters: Optional[dict], extension: str, mime: str) -> "Tuple[str, UnstructuredDocument]":
        """
        Convert the file (if a converter is configured), read its content and add metadata to the resulting document.

        :param file_path: path to the file to parse
        :param file_name: original name of the file
        :param parameters: parameters of the document processing
        :param extension: extension of the file
        :param mime: mime type of the file
        :return: path to the converted file (equals to `file_path` if there was no conversion) and the document with content and metadata
        """
        import os.path

        # the converter is optional (see __init__), without it the file is read as is
        if self.converter is None:
            converted_file_path = file_path
        else:
            converted_file_path = self.converter.convert(file_path, parameters=parameters, mime=mime, extension=extension)

        if converted_file_path != file_path:
            # the file was converted, so the reader needs mime and extension of the new file
            from dedoc.utils.utils import get_mime_extension
            mime, extension = get_mime_extension(file_path=converted_file_path)

        unstructured_document = self.reader.read(file_path=converted_file_path, parameters=parameters, mime=mime, extension=extension)
        metadata = self.document_metadata_extractor.extract(
            file_path=file_path,
            converted_filename=os.path.basename(converted_file_path),
            original_filename=file_name,
            parameters=parameters,
            mime=mime,
            extension=extension
        )
        # in case of equal keys, the extracted metadata override the ones filled by the reader
        unstructured_document.metadata = {**unstructured_document.metadata, **metadata}
        return converted_file_path, unstructured_document

    def __save(self, file_path: str, classified_document: "UnstructuredDocument") -> None:
        """
        Save the document lines and a copy of the file, is called only when "labeling_mode" is turned on in the config.

        :param file_path: path to the (converted) file to copy
        :param classified_document: document whose lines should be saved
        """
        import os.path
        import shutil
        from dedoc.utils.train_dataset_utils import get_path_original_documents, save_line_with_meta

        self.logger.info(f'Save document lines to {self.config["intermediate_data_path"]}')
        base_name = os.path.basename(file_path)
        save_line_with_meta(lines=classified_document.lines, config=self.config, original_document=base_name)
        shutil.copy(file_path, os.path.join(get_path_original_documents(self.config), base_name))