Source code for dedoc.metadata_extractors.abstract_metadata_extractor
from abc import ABC, abstractmethod
from typing import Optional
[docs]class AbstractMetadataExtractor(ABC):
"""
This class is responsible for extracting metadata from the documents of different formats.
"""
[docs] @abstractmethod
def can_extract(self,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: Optional[dict] = None,
other_fields: Optional[dict] = None) -> bool:
"""
Check if this extractor can handle the given file. Return True if the extractor can handle it and False otherwise.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.extract_metadata` documentation to get the information about parameters.
"""
pass
[docs] @abstractmethod
def extract_metadata(self,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: Optional[dict] = None,
other_fields: Optional[dict] = None) -> dict:
"""
Extract metadata from file if possible, i.e. method :meth:`can_extract` returned True.
:param directory: path to the directory where the original and converted files are located
:param filename: name of the file after renaming (for example 23141.doc). \
The file gets a new name during processing by the dedoc manager (if used)
:param converted_filename: name of the file after renaming and conversion (for example 23141.docx)
:param original_filename: name of the file before renaming
:param parameters: additional parameters for document parsing
:param other_fields: other fields that should be added to the document's metadata
:return: dict with metadata information about the document
"""
pass