Source code for dedoc.metadata_extractors.abstract_metadata_extractor

from abc import ABC, abstractmethod
from typing import Optional, Set, Tuple


class AbstractMetadataExtractor(ABC):
    """
    This class is responsible for extracting metadata from the documents of different formats.

    Subclasses must implement :meth:`extract`; :meth:`can_extract` decides, by file extension
    or MIME type, whether a particular extractor is applicable to a file.
    """

    def __init__(self,
                 *,
                 config: Optional[dict] = None,
                 recognized_extensions: Optional[Set[str]] = None,
                 recognized_mimes: Optional[Set[str]] = None) -> None:
        """
        :param config: configuration of the extractor, e.g. logger for logging
        :param recognized_extensions: set of supported files extensions with a dot, for example {.doc, .pdf}
        :param recognized_mimes: set of supported MIME types of files
        """
        import logging

        self.config = {} if config is None else config
        # A logger supplied through the config wins; otherwise fall back to the root logger.
        self.logger = self.config.get("logger", logging.getLogger())
        # NOTE: the defaults are real empty sets - a bare ``{}`` literal would create a dict,
        # which contradicts the ``Set[str]`` annotation and breaks set operations on the attribute.
        self._recognized_extensions = set() if recognized_extensions is None else recognized_extensions
        self._recognized_mimes = set() if recognized_mimes is None else recognized_mimes

    def can_extract(self,
                    file_path: str,
                    converted_filename: Optional[str] = None,
                    original_filename: Optional[str] = None,
                    parameters: Optional[dict] = None,
                    mime: Optional[str] = None,
                    extension: Optional[str] = None) -> bool:
        """
        Check if this extractor can handle the given file.

        The decision is made on the converted file (``converted_filename`` located in the directory
        of ``file_path``): the file is accepted when its lower-cased extension is among the recognized
        extensions or its MIME type is among the recognized MIME types.

        :param file_path: path to the file to extract metadata.
            If dedoc manager is used, the file gets a new name during processing - this name should
            be passed here (for example 23141.doc)
        :param converted_filename: name of the file after renaming and conversion (if dedoc manager is used,
            for example 23141.docx), by default it's a name from the file_path.
            Converted file should be located in the same directory as the file before converting.
        :param original_filename: name of the file before renaming (if dedoc manager is used),
            by default it's a name from the file_path
        :param parameters: additional parameters for document parsing, see :ref:`parameters_description`
            for more details (not used by this default implementation)
        :param mime: MIME type of a file
        :param extension: file extension, for example .doc or .pdf
        :return: True if the extractor can handle the given file and False otherwise
        """
        import os

        # Imported lazily, as in the original code, so the project helper is only needed when the check runs.
        from dedoc.utils.utils import get_mime_extension

        # Only the directory and the converted name matter here; the other resolved names are discarded.
        file_dir, _, converted_filename, _ = self._get_names(file_path, converted_filename, original_filename)
        converted_file_path = os.path.join(file_dir, converted_filename)
        # The helper receives the caller-provided mime/extension together with the converted file path
        # and returns the pair that is actually checked below.
        mime, extension = get_mime_extension(file_path=converted_file_path, mime=mime, extension=extension)
        return extension.lower() in self._recognized_extensions or mime in self._recognized_mimes

    @abstractmethod
    def extract(self,
                file_path: str,
                converted_filename: Optional[str] = None,
                original_filename: Optional[str] = None,
                parameters: Optional[dict] = None) -> dict:
        """
        Extract metadata from file if possible, i.e. method :meth:`can_extract` returned True.

        :param file_path: path to the file to extract metadata.
            If dedoc manager is used, the file gets a new name during processing - this name should
            be passed here (for example 23141.doc)
        :param converted_filename: name of the file after renaming and conversion (if dedoc manager is used,
            for example 23141.docx), by default it's a name from the file_path.
            Converted file should be located in the same directory as the file before converting.
        :param original_filename: name of the file before renaming (if dedoc manager is used),
            by default it's a name from the file_path
        :param parameters: additional parameters for document parsing, see :ref:`parameters_description`
            for more details
        :return: dict with metadata information about the document
        """
        pass

    def _get_names(self,
                   file_path: str,
                   converted_filename: Optional[str],
                   original_filename: Optional[str]) -> Tuple[str, str, str, str]:
        """
        Split ``file_path`` into directory and file name and fill in the missing optional names.

        :param file_path: path to the file
        :param converted_filename: name of the converted file or None to use the name from ``file_path``
        :param original_filename: original name of the file or None to use the name from ``file_path``
        :return: tuple (file_dir, file_name, converted_filename, original_filename)
        """
        import os

        file_dir, file_name = os.path.split(file_path)
        # Only ``None`` triggers the fallback: an explicitly passed name is kept as is.
        converted_filename = file_name if converted_filename is None else converted_filename
        original_filename = file_name if original_filename is None else original_filename
        return file_dir, file_name, converted_filename, original_filename