Source code for dedoc.data_structures.unstructured_document

from typing import List, Optional

from dedoc.data_structures.attached_file import AttachedFile
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.table import Table


[docs]class UnstructuredDocument: """ This class holds information about raw document content: its text, tables and attachments, that have been procured using one of the readers. Text is represented as a flat list of lines, hierarchy level of each line isn't defined (only tag hierarchy level may exist). """
[docs] def __init__(self, tables: List[Table], lines: List[LineWithMeta], attachments: List[AttachedFile], warnings: List[str] = None, metadata: Optional[dict] = None) -> None: """ :param tables: list of document tables :param lines: list of raw document lines :param attachments: list of documents attachments :param warnings: list of warnings, obtained in the process of the document parsing :param metadata: additional data """ self.tables = tables self.lines = lines self.attachments = attachments self.warnings = warnings if warnings else [] self.metadata = metadata if metadata is not None else {}