Source code for dedoc.readers.note_reader.note_reader

from typing import Optional

from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.base_reader import BaseReader


class NoteReader(BaseReader):
    """
    This class is used for parsing documents with .note.pickle extension.
    """

    def __init__(self, *, config: Optional[dict] = None) -> None:
        """
        Create the reader and register ``.note.pickle`` as its recognized extension.

        :param config: optional configuration dictionary, forwarded unchanged to the base reader
        """
        super().__init__(config=config, recognized_extensions={".note.pickle"})

    def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
        """
        The method returns the document content as a single line holding the whole note text.
        Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.

        :param file_path: path to the pickled note file
        :param parameters: accepted for interface compatibility; not used by this reader
        :return: document with one line (the note text), no tables and no attachments
        :raises BadFileFormatError: if the file cannot be opened, unpickled or converted into a document
        """
        import os
        import pickle

        from dedoc.common.exceptions.bad_file_error import BadFileFormatError
        from dedoc.data_structures.line_with_meta import LineWithMeta

        try:
            with open(file_path, "rb") as infile:
                # SECURITY: pickle.load can execute arbitrary code while unpickling.
                # Only files coming from a trusted source should reach this reader.
                note_dict = pickle.load(infile)

            # The unpickled object is indexed by the "content" key; the value may be str or bytes
            text = note_dict["content"]
            if isinstance(text, bytes):
                text = text.decode()

            # The whole note is stored as one line, it is not split by line breaks
            lines = [LineWithMeta(line=text)]
            return UnstructuredDocument(tables=[], lines=lines, attachments=[])
        except Exception as e:
            # Any failure is reported as a broken note file; the original error is kept
            # as the explicit cause so that the real reason is visible in the traceback.
            self.logger.warning(f"Can't handle {file_path}\n{e}")
            raise BadFileFormatError(f"Bad note file:\n file_name = {os.path.basename(file_path)}. Seems note-format is broken") from e