Source code for dedoc.metadata_extractors.abstract_metadata_extractor
from abc import ABC, abstractmethod
from typing import Optional
from dedoc.data_structures.unstructured_document import UnstructuredDocument
[docs]class AbstractMetadataExtractor(ABC):
"""
This class is responsible for extracting metadata from the documents of different formats.
"""
[docs] @abstractmethod
def can_extract(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: Optional[dict] = None,
other_fields: Optional[dict] = None) -> bool:
"""
Check if this extractor can handle the given file. Return True if the extractor can handle it and False otherwise.
Look to the :meth:`~dedoc.metadata_extractors.AbstractMetadataExtractor.add_metadata` documentation to get the information about parameters.
"""
pass
[docs] @abstractmethod
def add_metadata(self,
document: UnstructuredDocument,
directory: str,
filename: str,
converted_filename: str,
original_filename: str,
parameters: Optional[dict] = None,
other_fields: Optional[dict] = None) -> UnstructuredDocument:
"""
Add metadata to the document if possible, i.e. method :meth:`can_extract` returned True.
:type document: document content that has been received from some of the readers
:type directory: path to the directory where the original and converted files are located
:type filename: name of the file after renaming (for example 23141.doc). \
The file gets a new name during processing by the dedoc manager (if used)
:type converted_filename: name of the file after renaming and conversion (for example 23141.docx)
:type original_filename: name of the file before renaming
:type parameters: additional parameters for document parsing
:type other_fields: other fields that should be added to the document's metadata
:return: document content with added metadata attribute (dict with information about the document)
"""
pass