Source code for dedoc.readers.docx_reader.docx_reader

import logging
import os
from typing import List, Optional

from dedoc.attachments_extractors.concrete_attachments_extractors.docx_attachments_extractor import DocxAttachmentsExtractor
from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.extensions import recognized_extensions, recognized_mimes
from dedoc.readers.base_reader import BaseReader
from dedoc.readers.docx_reader.data_structures.docx_document import DocxDocument


[docs]class DocxReader(BaseReader): """ This class is used for parsing documents with .docx extension. Please use :class:`~dedoc.converters.DocxConverter` for getting docx file from similar formats. """
[docs] def __init__(self, *, config: dict) -> None: """ :param config: configuration of the reader, e.g. logger for logging """ self.attachment_extractor = DocxAttachmentsExtractor() self.logger = config.get("logger", logging.getLogger())
[docs] def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ return extension.lower() in recognized_extensions.docx_like_format or mime in recognized_mimes.docx_like_format
[docs] def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ parameters = {} if parameters is None else parameters attachments = self.attachment_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters) docx_document = DocxDocument(path=path, attachments=attachments, logger=self.logger) lines = self.__fix_lines(docx_document.lines) return UnstructuredDocument(lines=lines, tables=docx_document.tables, attachments=attachments, warnings=[])
def __fix_lines(self, lines: List[LineWithMeta]) -> List[LineWithMeta]: for i, line in enumerate(lines[1:]): if lines[i].metadata.tag_hierarchy_level != line.metadata.tag_hierarchy_level \ or lines[i].metadata.tag_hierarchy_level.line_type != HierarchyLevel.unknown \ or lines[i].line.endswith("\n"): continue old_len = len(lines[i].line) lines[i].set_line(lines[i].line + "\n") for annotation in lines[i].annotations: if annotation.end == old_len: annotation.end += 1 return lines