# Source code for dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader

from typing import List, Optional, Tuple

from dedocutils.data_structures import BBox
from numpy import ndarray

from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader


class PdfTxtlayerReader(PdfBaseReader):
    """
    This class extracts content (text, tables, attachments) from .pdf documents with a textual layer (copyable documents).
    It uses the pdfminer library for content extraction.

    For more information, look to `pdf_with_text_layer` option description in :ref:`pdf_handling_parameters`.
    """

    def __init__(self, *, config: Optional[dict] = None) -> None:
        """
        Register PDF-like extensions and mime types in the parent reader and create the pdfminer-based text layer extractor.

        :param config: configuration dictionary forwarded to the parent reader; the extractor is created with `self.config`
        """
        # NOTE(review): the imports are function-local, presumably to keep module import cheap or to avoid import cycles - confirm
        from dedoc.extensions import recognized_extensions, recognized_mimes

        super().__init__(config=config, recognized_extensions=recognized_extensions.pdf_like_format, recognized_mimes=recognized_mimes.pdf_like_format)

        from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import PdfminerExtractor
        self.extractor_layer = PdfminerExtractor(config=self.config)
        # value of the `pdf_with_text_layer` parameter that selects this reader (compared in can_read)
        self.reader_key = "true"

    def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
        """
        Check if the document extension is suitable for this reader (PDF format is supported only).
        This method returns `True` only when the key `pdf_with_text_layer` with value `true` is set in the dictionary `parameters`.

        You can look to :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments.

        Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
        """
        from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer

        if not super().can_read(file_path=file_path, mime=mime, extension=extension):
            return False
        return get_param_pdf_with_txt_layer(parameters) == self.reader_key

    def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
        """
        Read the document by delegating to the parent reader's `read` without any changes.

        :param file_path: path to the file, forwarded to the parent implementation
        :param parameters: reading parameters, forwarded to the parent implementation
        :return: the document returned by the parent implementation
        """
        return super().read(file_path, parameters)

    def _process_one_page(self,
                          image: ndarray,
                          parameters: ParametersForParseDoc,
                          page_number: int,
                          path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:
        """
        Extract lines, tables and attachments of one page using the textual layer of the PDF.

        Tables are recognized on the page image (only when `parameters.need_pdf_table_analysis` is set),
        text is taken from the textual layer; text boxes that intersect table locations are excluded from the lines.

        :param image: image of the page, used here only for table recognition
        :param parameters: parameters of document parsing
        :param page_number: number of the page, forwarded to the table recognizer and to the text layer extractor
        :param path: path to the PDF file, forwarded to the text layer extractor
        :return: tuple (lines, tables, attachments, empty list); all four lists are empty when the extractor returns no page
        """
        if parameters.need_pdf_table_analysis:
            gray_image = self._convert_to_gray(image)
            # the cleaned image returned by the recognizer is not needed here, only the tables are used
            _, tables = self.table_recognizer.recognize_tables_from_image(
                image=gray_image,
                page_number=page_number,
                language=parameters.language,
                table_type=parameters.table_type
            )
        else:
            tables = []

        page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number, parameters=parameters)
        if page is None:
            return [], [], [], []

        if parameters.need_gost_frame_analysis:
            frame_bbox, frame_page_size = self.gost_frame_boxes[page_number][0], self.gost_frame_boxes[page_number][1]
            self._move_table_cells(tables=tables, page_shift=frame_bbox, page=frame_page_size)
            self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)
            # frame_bbox is the bbox representing the content of the gost frame: exclude text boxes outside the frame
            page.bboxes = [bbox for bbox in page.bboxes if self._inside_any_unreadable_block(bbox.bbox, [frame_bbox])]

        # text that intersects a table location is not returned as lines
        unreadable_blocks = [location.bbox for table in tables for location in table.locations]
        page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
        lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)

        # without gost frame analysis the table boxes are converted after the lines are extracted (with it - already done above)
        if not parameters.need_gost_frame_analysis:
            self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)

        return lines, tables, page.attachments, []

    def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: Tuple[int, int]) -> None:
        """
        Move tables back to original coordinates when parsing a document containing a gost frame.
        The tables are modified in place.

        :param tables: tables whose locations and cells should be shifted
        :param page_shift: bbox whose top left corner is used as the shift for x and y coordinates
        :param page: pair (image height, image width) that is set as the page size of table locations and passed to the cells
        """
        image_height, image_width = page
        # the shift is the same for all tables, so it is computed once
        shift_x, shift_y = page_shift.x_top_left, page_shift.y_top_left

        for table in tables:
            for location in table.locations:
                location.bbox.shift(shift_x=shift_x, shift_y=shift_y)
                location.page_height, location.page_width = image_height, image_width

            for row in table.cells:
                for cell in row:
                    cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)

    def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None:
        """
        Change table boxes' page width and height into pdf space like textual lines.
        Each cell of each table is updated in place.

        :param pdf_width: new page width passed to each cell
        :param pdf_height: new page height passed to each cell
        :param tables: tables whose cells should be updated
        """
        for table in tables:
            for row in table.cells:
                for cell in row:
                    cell.change_lines_boxes_page_width_height(new_page_width=pdf_width, new_page_height=pdf_height)

    def _inside_any_unreadable_block(self, obj_bbox: BBox, unreadable_blocks: List[BBox]) -> bool:
        """
        Check whether obj_bbox intersects at least one of the given blocks.
        Despite the method name, the check is `have_intersection_with_box`, not a strict containment test.

        :param obj_bbox: bounding box of the object to check
        :param unreadable_blocks: bounding boxes of the blocks to check against
        :return: True if some block has an intersection with obj_bbox, False otherwise (also for an empty list of blocks)
        """
        return any(block.have_intersection_with_box(obj_bbox) for block in unreadable_blocks)