Source code for dedoc.attachments_extractors.abstract_attachment_extractor
import os
import uuid
from abc import ABC, abstractmethod
from typing import List, Optional, Tuple
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.utils.utils import save_data_to_unique_file
[docs]class AbstractAttachmentsExtractor(ABC):
"""
This class is responsible for extracting files attached to the documents of different formats.
"""
[docs] @abstractmethod
def can_extract(self, extension: str, mime: str, parameters: Optional[dict] = None) -> bool:
"""
Check if this attachments extractor can get attachments of the file with the given extension.
:param extension: file extension, for example .doc or .pdf
:param mime: MIME type of file
:param parameters: any additional parameters for given document
:return: the indicator of possibility to get attachments of this file
"""
pass
[docs] @abstractmethod
def get_attachments(self, tmpdir: str, filename: str, parameters: dict) -> List[AttachedFile]:
"""
Extract attachments from the given file.
This method can only be called on appropriate files, ensure that \
:meth:`~dedoc.attachments_extractors.AbstractAttachmentsExtractor.can_extract` is True for the given file.
:param tmpdir: directory where file is located and where the attached files will be saved
:param filename: name of the file to extract attachments (not absolute path)
:param parameters: dict with different parameters for extracting
:return: list of file's attachments
"""
pass
[docs] @staticmethod
def with_attachments(parameters: dict) -> bool:
"""
Check if the option `with_attachments` is true in the parameters.
:param parameters: parameters for the attachment extractor
:return: indicator if with_attachments option is true
"""
return str(parameters.get("with_attachments", "false")).lower() == "true"
def _content2attach_file(self, content: List[Tuple[str, bytes]], tmpdir: str, need_content_analysis: bool) -> List[AttachedFile]:
attachments = []
for original_name, contents in content:
tmp_file_name = save_data_to_unique_file(directory=tmpdir, filename=original_name, binary_data=contents)
tmp_file_path = os.path.join(tmpdir, tmp_file_name)
file = AttachedFile(original_name=original_name,
tmp_file_path=tmp_file_path,
uid=f"attach_{uuid.uuid1()}",
need_content_analysis=need_content_analysis)
attachments.append(file)
return attachments