# Source code for dedoc.attachments_extractors.abstract_attachment_extractor

from abc import ABC, abstractmethod
from typing import List, Optional, Set, Tuple

from dedoc.data_structures.attached_file import AttachedFile


class AbstractAttachmentsExtractor(ABC):
    """
    This class is responsible for extracting files attached to the documents of different formats.
    """

    def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Optional[Set[str]] = None,
                 recognized_mimes: Optional[Set[str]] = None) -> None:
        """
        :param config: configuration of the attachments extractor, e.g. logger for logging
        :param recognized_extensions: set of supported files extensions with a dot, for example {.doc, .pdf}
        :param recognized_mimes: set of supported MIME types of files
        """
        import logging

        self.config = {} if config is None else config
        # Fall back to the root logger when the config does not provide one.
        self.logger = self.config.get("logger", logging.getLogger())
        # Default to empty *sets*: a bare ``{}`` literal is a dict, which would contradict the declared Set[str] type.
        self._recognized_extensions = set() if recognized_extensions is None else recognized_extensions
        self._recognized_mimes = set() if recognized_mimes is None else recognized_mimes

    def can_extract(self, file_path: Optional[str] = None, extension: Optional[str] = None, mime: Optional[str] = None,
                    parameters: Optional[dict] = None) -> bool:
        """
        Check if this attachments extractor can get attachments of the file.
        You should provide at least one of the following parameters: file_path, extension, mime.

        :param file_path: the path of the file to extract attachments from
        :param extension: file extension with a dot, for example .doc or .pdf
        :param mime: MIME type of file
        :param parameters: any additional parameters for the given document
        :return: the indicator of possibility to get attachments of this file
        """
        from dedoc.utils.utils import get_mime_extension

        mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
        # Extensions are compared case-insensitively, MIME types are compared as given.
        return extension.lower() in self._recognized_extensions or mime in self._recognized_mimes

    @abstractmethod
    def extract(self, file_path: str, parameters: Optional[dict] = None) -> "List[AttachedFile]":
        """
        Extract attachments from the given file.
        This method can only be called on appropriate files, ensure that \
        :meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.can_extract` is True for the given file.

        :param file_path: path of the file to extract attachments from
        :param parameters: dict with different parameters for extracting, see :ref:`attachments_handling_parameters` for more details
        :return: list of file's attachments
        """
        pass

    @staticmethod
    def with_attachments(parameters: Optional[dict]) -> bool:
        """
        Check if the option `with_attachments` is true in the parameters.

        :param parameters: parameters for the attachment extractor, None is treated as an empty dict
        :return: indicator if with_attachments option is true
        """
        # Other methods of this class accept ``parameters=None``; treat a missing dict as "option not set".
        if not parameters:
            return False
        # The value may be a bool or a string in any letter case, so normalize through str().lower().
        return str(parameters.get("with_attachments", "false")).lower() == "true"

    def _content2attach_file(self, content: List[Tuple[str, bytes]], tmpdir: str, need_content_analysis: bool,
                             parameters: dict) -> "List[AttachedFile]":
        """
        Save the given binary contents to files and wrap each saved file into an :class:`AttachedFile`.

        :param content: list of pairs (original file name, binary content of the file)
        :param tmpdir: directory passed to `get_param_attachments_dir` together with the parameters to choose where files are saved
        :param need_content_analysis: value forwarded to each created :class:`AttachedFile`
        :param parameters: parameters passed to `get_param_attachments_dir`
        :return: list of attached files, one for each item of `content`, in the same order
        """
        import os
        import uuid
        from dedoc.utils.parameter_utils import get_param_attachments_dir
        from dedoc.utils.utils import save_data_to_unique_file

        attachments = []
        attachments_dir = get_param_attachments_dir(parameters, tmpdir)

        for original_name, contents in content:
            # The helper returns the name of the file it wrote, join it with the directory to get the full path.
            tmp_file_name = save_data_to_unique_file(directory=attachments_dir, filename=original_name, binary_data=contents)
            tmp_file_path = os.path.join(attachments_dir, tmp_file_name)
            file = AttachedFile(original_name=original_name,
                                tmp_file_path=tmp_file_path,
                                uid=f"attach_{uuid.uuid4()}",
                                need_content_analysis=need_content_analysis)
            attachments.append(file)
        return attachments