Source code for dedoc.readers.docx_reader.docx_reader

from typing import List, Optional

from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.base_reader import BaseReader


[docs]class DocxReader(BaseReader): """ This class is used for parsing documents with .docx extension. Please use :class:`~dedoc.converters.DocxConverter` for getting docx file from similar formats. """ def __init__(self, *, config: Optional[dict] = None) -> None: from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor from dedoc.extensions import recognized_extensions, recognized_mimes super().__init__(config=config, recognized_extensions=recognized_extensions.docx_like_format, recognized_mimes=recognized_mimes.docx_like_format) self.attachment_extractor = DocxAttachmentsExtractor(config=self.config)
[docs] def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument from dedoc.utils.parameter_utils import get_param_with_attachments with_attachments = get_param_with_attachments(parameters) attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters) if with_attachments else [] docx_document = DocxDocument(path=file_path, attachments=attachments, logger=self.logger) lines = self.__fix_lines(docx_document.lines) return UnstructuredDocument(lines=lines, tables=docx_document.tables, attachments=attachments, warnings=[])
def __fix_lines(self, lines: List[LineWithMeta]) -> List[LineWithMeta]: from dedoc.data_structures.hierarchy_level import HierarchyLevel for i, line in enumerate(lines[1:]): if lines[i].metadata.tag_hierarchy_level != line.metadata.tag_hierarchy_level \ or lines[i].metadata.tag_hierarchy_level.line_type != HierarchyLevel.unknown \ or lines[i].line.endswith("\n"): continue old_len = len(lines[i].line) lines[i].set_line(lines[i].line + "\n") for annotation in lines[i].annotations: if annotation.end == old_len: annotation.end += 1 return lines