Source code for dedoc.readers.pptx_reader.pptx_reader

import os
from typing import Optional

from pptx import Presentation

from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor
from dedoc.data_structures.line_metadata import LineMetadata
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.table import Table
from dedoc.data_structures.table_metadata import TableMetadata
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.extensions import recognized_extensions, recognized_mimes
from dedoc.readers.base_reader import BaseReader


[docs]class PptxReader(BaseReader): """ This class is used for parsing documents with .pptx extension. Please use :class:`~dedoc.converters.PptxConverter` for getting pptx file from similar formats. """
[docs] def __init__(self) -> None: self.attachments_extractor = PptxAttachmentsExtractor()
[docs] def can_read(self, path: str, mime: str, extension: str, document_type: str = None, parameters: Optional[dict] = None) -> bool: """ Check if the document extension is suitable for this reader. Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters. """ return extension.lower() in recognized_extensions.pptx_like_format or mime in recognized_mimes.pptx_like_format
[docs] def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument: """ The method return document content with all document's lines, tables and attachments. Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters. """ parameters = {} if parameters is None else parameters prs = Presentation(path) lines, tables = [], [] for page_id, slide in enumerate(prs.slides, start=1): for paragraph_id, shape in enumerate(slide.shapes, start=1): if shape.has_text_frame: lines.append(LineWithMeta(line=shape.text, metadata=LineMetadata(page_id=page_id, line_id=paragraph_id), annotations=[])) if shape.has_table: cells = [[cell.text for cell in row.cells] for row in shape.table.rows] tables.append(Table(cells=cells, metadata=TableMetadata(page_id=page_id))) attachments = self.attachments_extractor.get_attachments(tmpdir=os.path.dirname(path), filename=os.path.basename(path), parameters=parameters) return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=[])