Source code for dedoc.data_structures.parsed_document

from typing import List, Optional

from dedoc.api.schema.parsed_document import ParsedDocument as ApiParsedDocument
from dedoc.data_structures.document_content import DocumentContent
from dedoc.data_structures.document_metadata import DocumentMetadata
from dedoc.data_structures.serializable import Serializable


[docs]class ParsedDocument(Serializable): """ This class holds information about the document content, metadata and attachments. :ivar content: document text (hierarchy of nodes) and tables :ivar attachments: result of analysis of attached files (empty if with_attachments=False) :ivar metadata: document metadata such as size, creation date and so on. :ivar warnings: list of warnings and possible errors, arising in the process of document parsing :vartype content: DocumentContent :vartype attachments: List[ParsedDocument] :vartype metadata: DocumentMetadata :vartype warnings: List[str] """ def __init__(self, metadata: DocumentMetadata, content: DocumentContent, warnings: Optional[List[str]] = None, attachments: Optional[List["ParsedDocument"]] = None) -> None: """ :param metadata: document metadata such as size, creation date and so on. :param content: text and tables :param attachments: result of analysis of attached files :param warnings: list of warnings and possible errors, arising in the process of document parsing """ self.metadata: DocumentMetadata = metadata self.content: DocumentContent = content self.attachments: List["ParsedDocument"] = [] if attachments is None else attachments self.warnings: List[str] = warnings if warnings is not None else [] def add_attachments(self, new_attachment: List["ParsedDocument"]) -> None: if self.attachments is None: self.attachments = [] self.attachments.extend(new_attachment) def set_metadata(self, metadata: DocumentMetadata) -> None: self.metadata = metadata
[docs] def to_api_schema(self) -> ApiParsedDocument: import dedoc.version content = self.content.to_api_schema() metadata = self.metadata.to_api_schema() attachments = [attachment.to_api_schema() for attachment in self.attachments] if self.attachments is not None else [] return ApiParsedDocument(content=content, metadata=metadata, version=dedoc.version.__version__, warnings=self.warnings, attachments=attachments)