Source code for dedoc.readers.pptx_reader.pptx_reader

from typing import Dict, List, Optional

from bs4 import BeautifulSoup, Tag

from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.table import Table
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.base_reader import BaseReader
from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor


class PptxReader(BaseReader):
    """
    Reader for presentations with the .pptx extension.

    Files in similar formats should be converted to pptx beforehand with :class:`~dedoc.converters.PptxConverter`.
    """

    def __init__(self, *, config: Optional[dict] = None) -> None:
        """
        Register the pptx-like extensions and mime types in the base reader and create the helper extractors.

        :param config: optional configuration, forwarded to the base reader; the attachments extractor
            receives ``self.config`` as it stands after the base class initialisation
        """
        from dedoc.attachments_extractors.concrete_attachments_extractors.pptx_attachments_extractor import PptxAttachmentsExtractor
        from dedoc.extensions import recognized_extensions, recognized_mimes
        from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor

        super().__init__(
            config=config,
            recognized_extensions=recognized_extensions.pptx_like_format,
            recognized_mimes=recognized_mimes.pptx_like_format,
        )
        self.attachments_extractor = PptxAttachmentsExtractor(config=self.config)
        self.numbering_extractor = NumberingExtractor()

    def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
        """
        Parse the presentation and return its content: all lines, tables and attachments of the document.

        Slides are processed in ascending order of the number in their file name and the zero-based position
        of a slide in that order is used as the page id. Tables and pictures are attached as annotations to the
        last line collected so far (an empty line is created when there is none yet).
        See :meth:`~dedoc.readers.BaseReader.read` for the description of the method's parameters.
        """
        from dedoc.data_structures.line_metadata import LineMetadata
        from dedoc.readers.pptx_reader.shape import PptxShape
        from dedoc.utils.parameter_utils import get_param_with_attachments

        # attachments are extracted only when the parameters ask for them
        if get_param_with_attachments(parameters):
            attachments = self.attachments_extractor.extract(file_path=file_path, parameters=parameters)
        else:
            attachments = []
        uid_by_attachment_name = {item.original_name: item.uid for item in attachments}
        image_name_by_rel_id = self.__get_slide_images_rels(file_path)
        properties_extractor = PropertiesExtractor(file_path)
        slides = self.__get_slides_bs(file_path, xml_prefix="ppt/slides/slide", xml_postfix=".xml")

        lines = []
        tables = []
        for slide_id, slide_xml in enumerate(slides):
            shape_tree = slide_xml.spTree
            # stays True until some text shape of the current slide has produced at least one line
            title_pending = True

            for tag in shape_tree:
                if tag.name == "sp":
                    # shapes without a text body contribute nothing
                    if tag.txBody:
                        shape = PptxShape(tag, page_id=slide_id, init_line_id=len(lines), numbering_extractor=self.numbering_extractor,
                                          properties_extractor=properties_extractor, is_title=title_pending)
                        shape_lines = shape.get_lines()
                        lines.extend(shape_lines)
                        if title_pending:
                            title_pending = len(shape_lines) == 0
                    continue

                table_xml = tag.tbl
                if table_xml:
                    self.__add_table(lines=lines, tables=tables, page_id=slide_id, table_xml=table_xml, properties_extractor=properties_extractor)
                    continue

                if tag.name == "pic" and tag.blip:
                    if not lines:
                        # the annotation needs a line to live on
                        lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=slide_id, line_id=0)))
                    # same key layout as the one built in __get_slide_images_rels: slide index + relationship id
                    image_rel_id = str(slide_id) + tag.blip.get("r:embed", "")
                    self.__add_attach_annotation(lines[-1], image_rel_id, uid_by_attachment_name, image_name_by_rel_id)

        return UnstructuredDocument(lines=lines, tables=tables, attachments=attachments, warnings=[])

    def __get_slides_bs(self, path: str, xml_prefix: str, xml_postfix: str) -> List[BeautifulSoup]:
        """
        Parse the archive members whose names start with `xml_prefix` and end with `xml_postfix`.

        The part of the name between the prefix and the postfix is converted with `int` and
        the parsed members are returned in ascending order of that number.
        """
        import zipfile
        from dedoc.utils.office_utils import get_bs_from_zip

        def member_number(member_name: str) -> int:
            return int(member_name[len(xml_prefix):-len(xml_postfix)])

        with zipfile.ZipFile(path) as archive:
            member_names = archive.namelist()

        matching_names = [name for name in member_names if name.startswith(xml_prefix) and name.endswith(xml_postfix)]
        matching_names.sort(key=member_number)
        return [get_bs_from_zip(path, name, remove_spaces=True) for name in matching_names]

    def __get_slide_images_rels(self, path: str) -> Dict[str, str]:
        """
        Build the mapping {image id -> image name} from the slides' relationship files.

        The image id is the zero-based position of the relationship file (in ascending order of its number)
        concatenated with the relationship "Id"; the image name is the relationship "Target" without the
        "../media/" prefix. Relationships whose target doesn't start with that prefix are skipped.
        """
        rels_per_slide = self.__get_slides_bs(path, xml_prefix="ppt/slides/_rels/slide", xml_postfix=".xml.rels")
        media_prefix = "../media/"
        return {
            str(slide_id) + rel["Id"]: rel["Target"][len(media_prefix):]
            for slide_id, rels_xml in enumerate(rels_per_slide)
            for rel in rels_xml.find_all("Relationship")
            if rel["Target"].startswith(media_prefix)
        }

    def __add_table(self, lines: List[LineWithMeta], tables: List[Table], page_id: int, table_xml: Tag, properties_extractor: PropertiesExtractor) -> None:
        """
        Convert `table_xml` to a table, append it to `tables` and reference it from the last line of `lines`.

        Both lists are modified in place; an empty line is appended to `lines` first when the list is empty.
        """
        from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
        from dedoc.data_structures.line_metadata import LineMetadata
        from dedoc.readers.pptx_reader.table import PptxTable

        table = PptxTable(table_xml, page_id, self.numbering_extractor, properties_extractor).to_table()
        if not lines:
            lines.append(LineWithMeta(line="", metadata=LineMetadata(page_id=page_id, line_id=0)))

        anchor_line = lines[-1]
        anchor_line.annotations.append(TableAnnotation(start=0, end=len(anchor_line), value=table.metadata.uid))
        tables.append(table)

    def __add_attach_annotation(self, line: LineWithMeta, image_rel_id: str, attachment_name2uid: dict, images_rels: dict) -> None:
        """
        Annotate `line` with the uid of the attachment that `image_rel_id` refers to.

        The uid is resolved in two steps: image id -> image name (`images_rels`), image name -> uid (`attachment_name2uid`).
        When either key is missing, a warning is logged and the line is left without the annotation.
        """
        from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation

        try:
            attach_uid = attachment_name2uid[images_rels[image_rel_id]]
            line.annotations.append(AttachAnnotation(start=0, end=len(line), attach_uid=attach_uid))
        except KeyError as error:
            self.logger.warning(f"Attachment key hasn't been found ({error})")