from typing import List, Optional
from dedoc.common.exceptions.dedoc_error import DedocError
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.data_structures.document_metadata import DocumentMetadata
from dedoc.data_structures.parsed_document import ParsedDocument
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.dedoc_manager import DedocManager
[docs]class AttachmentsHandler:
"""
This class is used for handling attached files:
- they may be stored in the custom directory (use `attachments_dir` key in the parameters to set output directory path);
- they may be ignored (if the option `with_attachments=false` in parameters);
- the metadata of the attachments may be added without files parsing (if `with_attachments=true, need_content_analysis=false` in parameters)
- they may be parsed (if `with_attachments=true, need_content_analysis=true` in parameters), \
the parsing recursion may be set via `recursion_deep_attachments` parameter.
"""
[docs] def __init__(self, *, config: Optional[dict] = None) -> None:
"""
:param config: configuration of the handler, e.g. logger for logging
"""
import logging
self.config = {} if config is None else config
self.logger = self.config.get("logger", logging.getLogger())
[docs] def handle_attachments(self, document_parser: DedocManager, document: UnstructuredDocument, parameters: dict) -> List[ParsedDocument]:
"""
Handle attachments of the document in the intermediate representation.
:param document_parser: class with `parse` method for parsing attachments if needed;
:param document: intermediate representation of the document whose attachments need to be handled;
:param parameters: parameters for attachments handling (with_attachments, need_content_analysis, recursion_deep_attachments, attachments_dir \
are important, look to the API parameters documentation for more details).
:return: list of parsed document attachments
"""
import copy
import os
import time
from dedoc.utils.parameter_utils import get_param_with_attachments
attachments = []
recursion_deep_attachments = int(parameters.get("recursion_deep_attachments", 10)) - 1
if not get_param_with_attachments(parameters) or recursion_deep_attachments < 0:
return attachments
previous_log_time = time.time()
for i, attachment in enumerate(document.attachments):
current_time = time.time()
if current_time - previous_log_time > 3:
previous_log_time = current_time # not log too often
self.logger.info(f"Handle attachment {i} of {len(document.attachments)}")
if not attachment.get_original_filename(): # TODO check for docx https://jira.ispras.ru/browse/TLDR-185
continue
parameters_copy = copy.deepcopy(parameters)
parameters_copy["is_attached"] = True
parameters_copy["recursion_deep_attachments"] = str(recursion_deep_attachments)
try:
if attachment.need_content_analysis:
parsed_file = document_parser.parse(attachment.get_filename_in_path(), parameters=parameters_copy)
else:
parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy)
parsed_file.metadata.file_name = attachment.original_name # initial name of the attachment
parsed_file.metadata.temporary_file_name = os.path.split(attachment.get_filename_in_path())[-1] # actual name in the file system
except DedocError:
# return empty ParsedDocument with Meta information
parsed_file = self.__get_empty_document(document_parser=document_parser, attachment=attachment, parameters=parameters_copy)
parsed_file.metadata.uid = attachment.uid
attachments.append(parsed_file)
return attachments
def __get_empty_document(self, document_parser: DedocManager, attachment: AttachedFile, parameters: dict) -> ParsedDocument:
from dedoc.utils.utils import get_empty_content
metadata = document_parser.document_metadata_extractor.extract(
file_path=attachment.get_filename_in_path(),
original_filename=attachment.get_original_filename(),
parameters=parameters
)
metadata = DocumentMetadata(**metadata)
return ParsedDocument(content=get_empty_content(), metadata=metadata)