Source code for dedoc.readers.note_reader.note_reader

import logging
import os
import pickle
from typing import Optional

from dedoc.common.exceptions.bad_file_error import BadFileFormatError
from dedoc.data_structures.line_metadata import LineMetadata
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.base_reader import BaseReader


class NoteReader(BaseReader):
    """
    This class is used for parsing documents with .note.pickle extension.

    The file is expected to be a pickled mapping that has a ``"content"`` key
    holding the note text as ``str`` or ``bytes``.
    """

    def __init__(self, *, config: dict) -> None:
        """
        :param config: configuration of the reader, e.g. logger for logging
        """
        self.config = config
        # Fall back to the root logger when the config does not provide one.
        self.logger = config.get("logger", logging.getLogger())

    def can_read(self,
                 path: str,
                 mime: str,
                 extension: str,
                 document_type: Optional[str] = None,
                 parameters: Optional[dict] = None) -> bool:
        """
        Check if the document extension is suitable for this reader.
        Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.

        Only ``extension`` is inspected here; the other arguments are ignored by this reader.

        :return: True if the lower-cased extension ends with ``.note.pickle``
        """
        return extension.lower().endswith(".note.pickle")

    def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument:
        """
        The method return document content with all document's lines.
        Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.

        The whole note text is returned as a single line (``line_id=0``, ``page_id=0``) without annotations;
        the resulting document has no tables and no attachments.

        :return: the document that contains one line with the note content
        :raises BadFileFormatError: if the file cannot be opened, unpickled or converted into a document line
        """
        try:
            with open(path, "rb") as infile:
                # SECURITY: pickle.load can execute arbitrary code while deserializing.
                # Only files from trusted sources should be passed to this reader.
                note_dict = pickle.load(infile)

            text = note_dict["content"]
            if isinstance(text, bytes):
                # bytes.decode() without arguments uses UTF-8
                text = text.decode()

            lines = [LineWithMeta(line=text, annotations=[], metadata=LineMetadata(line_id=0, page_id=0))]
            return UnstructuredDocument(tables=[], lines=lines, attachments=[])
        except Exception as e:
            # Any failure is reported to the caller as a broken note file;
            # the original exception is chained so that the root cause stays visible in tracebacks.
            self.logger.warning(f"Can't handle {path}\n{e}")
            raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(path)}. Seems note-format is broken") from e