# Source code for dedoc.metadata_extractors.abstract_metadata_extractor
from abc import ABC, abstractmethod
from typing import Optional, Set, Tuple
class AbstractMetadataExtractor(ABC):
    """
    This class is responsible for extracting metadata from the documents of different formats.

    Subclasses implement :meth:`extract`; :meth:`can_extract` decides, by file extension or MIME type,
    whether a particular extractor is applicable to a file.
    """

    def __init__(self,
                 *,
                 config: Optional[dict] = None,
                 recognized_extensions: Optional[Set[str]] = None,
                 recognized_mimes: Optional[Set[str]] = None) -> None:
        """
        :param config: configuration of the extractor, e.g. logger for logging
        :param recognized_extensions: set of supported files extensions with a dot, for example {.doc, .pdf}
        :param recognized_mimes: set of supported MIME types of files
        """
        import logging

        self.config = {} if config is None else config
        # Use the logger from the config if there is one, otherwise fall back to the root logger.
        self.logger = self.config.get("logger", logging.getLogger())
        # Default to real empty sets: `{}` is an empty dict, which contradicts the Set[str] annotation
        # and breaks set operations (add, union, ...) on these attributes.
        self._recognized_extensions = set() if recognized_extensions is None else recognized_extensions
        self._recognized_mimes = set() if recognized_mimes is None else recognized_mimes

    def can_extract(self,
                    file_path: str,
                    converted_filename: Optional[str] = None,
                    original_filename: Optional[str] = None,
                    parameters: Optional[dict] = None,
                    mime: Optional[str] = None,
                    extension: Optional[str] = None) -> bool:
        """
        Check if this extractor can handle the given file.
        The file is accepted if its lower-cased extension is among the recognized extensions
        or its MIME type is among the recognized MIME types.

        :param file_path: path to the file to extract metadata. \
        If dedoc manager is used, the file gets a new name during processing - this name should be passed here (for example 23141.doc)
        :param converted_filename: name of the file after renaming and conversion (if dedoc manager is used, for example 23141.docx), \
        by default it's a name from the file_path. Converted file should be located in the same directory as the file before converting.
        :param original_filename: name of the file before renaming (if dedoc manager is used), by default it's a name from the file_path
        :param parameters: additional parameters for document parsing, see :ref:`parameters_description` for more details
        :param mime: MIME type of a file
        :param extension: file extension, for example .doc or .pdf
        :return: True if the extractor can handle the given file and False otherwise
        """
        import os
        from dedoc.utils.utils import get_mime_extension

        # Only the directory and the converted file name are needed here.
        file_dir, _, converted_filename, _ = self._get_names(file_path, converted_filename, original_filename)
        # The check is performed on the converted file, which lives next to the file given by file_path.
        converted_file_path = os.path.join(file_dir, converted_filename)
        mime, extension = get_mime_extension(file_path=converted_file_path, mime=mime, extension=extension)
        return extension.lower() in self._recognized_extensions or mime in self._recognized_mimes

    @abstractmethod
    def extract(self,
                file_path: str,
                converted_filename: Optional[str] = None,
                original_filename: Optional[str] = None,
                parameters: Optional[dict] = None) -> dict:
        """
        Extract metadata from file if possible, i.e. method :meth:`can_extract` returned True.

        :param file_path: path to the file to extract metadata. \
        If dedoc manager is used, the file gets a new name during processing - this name should be passed here (for example 23141.doc)
        :param converted_filename: name of the file after renaming and conversion (if dedoc manager is used, for example 23141.docx), \
        by default it's a name from the file_path. Converted file should be located in the same directory as the file before converting.
        :param original_filename: name of the file before renaming (if dedoc manager is used), by default it's a name from the file_path
        :param parameters: additional parameters for document parsing, see :ref:`parameters_description` for more details
        :return: dict with metadata information about the document
        """

    def _get_names(self, file_path: str, converted_filename: Optional[str], original_filename: Optional[str]) -> Tuple[str, str, str, str]:
        """
        Split the file path and fill in the file names that were not given.

        :param file_path: path to the file
        :param converted_filename: name of the converted file or None to use the name from file_path
        :param original_filename: name of the original file or None to use the name from file_path
        :return: tuple (directory of the file, file name from file_path, converted file name, original file name)
        """
        import os

        file_dir, file_name = os.path.split(file_path)
        converted_filename = file_name if converted_filename is None else converted_filename
        original_filename = file_name if original_filename is None else original_filename
        return file_dir, file_name, converted_filename, original_filename