# Source code for dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader

from typing import List, Optional, Tuple

from dedocutils.data_structures import BBox
from numpy import ndarray

from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader


class PdfTxtlayerReader(PdfBaseReader):
    """
    This class allows extracting content (text, tables, attachments) from .pdf documents with a textual layer (copyable documents).
    It uses the pdfminer library for content extraction.

    For more information, look to `pdf_with_text_layer` option description in :ref:`pdf_handling_parameters`.
    """

    def __init__(self, *, config: Optional[dict] = None) -> None:
        """
        Initialize the base PDF reader for pdf-like formats and create the pdfminer-based text layer extractor.

        :param config: optional configuration dictionary, forwarded to the parent constructor
        """
        # NOTE(review): imports are function-local as in the original; presumably to avoid import cycles or eager heavy imports - confirm
        from dedoc.extensions import recognized_extensions, recognized_mimes

        super().__init__(
            config=config,
            recognized_extensions=recognized_extensions.pdf_like_format,
            recognized_mimes=recognized_mimes.pdf_like_format,
        )

        from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import PdfminerExtractor

        # self.config (not the raw argument) is used on purpose; presumably it is set by the parent constructor - TODO confirm
        self.extractor_layer = PdfminerExtractor(config=self.config)
        # value of the `pdf_with_text_layer` parameter that selects this reader (compared in can_read)
        self.reader_key = "true"

    def can_read(self,
                 file_path: Optional[str] = None,
                 mime: Optional[str] = None,
                 extension: Optional[str] = None,
                 parameters: Optional[dict] = None) -> bool:
        """
        Check if the document extension is suitable for this reader (PDF format is supported only).
        This method returns `True` only when the key `pdf_with_text_layer` with value `true` is set in the dictionary `parameters`.

        You can look to :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments.

        Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
        """
        from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer

        # the parent check is evaluated first; the parameter is inspected only when the parent check passes
        return super().can_read(file_path=file_path, mime=mime, extension=extension) and get_param_pdf_with_txt_layer(parameters) == self.reader_key

    def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
        """
        Read the document by delegating to the parent class implementation without any changes.

        :param file_path: path to the file to read
        :param parameters: optional dictionary with reading parameters
        :return: the document returned by the parent ``read`` method
        """
        return super().read(file_path, parameters)

    def _process_one_page(self,
                          image: ndarray,
                          parameters: ParametersForParseDoc,
                          page_number: int,
                          path: str) -> Tuple[List[LineWithLocation], List[ScanTable], List[PdfImageAttachment], List[float]]:
        """
        Extract lines, tables and attachments of one page using the textual layer of the PDF file.

        :param image: image of the page, used here only for table recognition
        :param parameters: parsing parameters; the flags `need_pdf_table_analysis` and `need_gost_frame_analysis`
            and the fields `language` and `table_type` are read by this method
        :param page_number: number of the page; passed to the table recognizer and the text layer extractor,
            and used as an index into `self.gost_frame_boxes`
        :param path: path to the PDF file, passed to the text layer extractor
        :return: tuple (lines, tables, attachments, []); all four elements are empty lists when the extractor returns no page.
            The last element is always an empty list in this reader.
        """
        if parameters.need_pdf_table_analysis:
            gray_image = self._convert_to_gray(image)
            # the first returned value (cleaned image) is not needed: the text comes from the textual layer, not from the image
            _, tables = self.table_recognizer.recognize_tables_from_image(
                image=gray_image,
                page_number=page_number,
                language=parameters.language,
                table_type=parameters.table_type
            )
        else:
            tables = []

        page = self.extractor_layer.extract_text_layer(path=path, page_number=page_number, parameters=parameters)
        if page is None:
            return [], [], [], []

        if parameters.need_gost_frame_analysis:
            # gost_frame_boxes[page_number] holds (bbox of the frame content, (image height, image width)),
            # as follows from the way both items are used below; the attribute is filled outside this class
            page_shift = self.gost_frame_boxes[page_number][0]
            self._move_table_cells(tables=tables, page_shift=page_shift, page=self.gost_frame_boxes[page_number][1])
            self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)
            readable_blocks = [page_shift]  # bbox representing the content of the gost frame
            # exclude boxes outside the frame
            page.bboxes = [bbox for bbox in page.bboxes if self._inside_any_unreadable_block(bbox.bbox, readable_blocks)]

        # text boxes that intersect table locations are dropped from the page text
        unreadable_blocks = [location.bbox for table in tables for location in table.locations]
        page.bboxes = [bbox for bbox in page.bboxes if not self._inside_any_unreadable_block(bbox.bbox, unreadable_blocks)]
        lines = self.metadata_extractor.extract_metadata_and_set_annotations(page_with_lines=page, call_classifier=False)

        # without gost frame analysis the tables are converted to the pdf page size here; with it, this was already done above
        if not parameters.need_gost_frame_analysis:
            self.__change_table_boxes_page_width_heigth(pdf_width=page.pdf_page_width, pdf_height=page.pdf_page_height, tables=tables)

        return lines, tables, page.attachments, []

    def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: Tuple[int, int]) -> None:
        """
        Move tables back to original coordinates when parsing a document containing a gost frame.
        Table locations and cells are modified in place.

        :param tables: tables whose locations and cells should be shifted
        :param page_shift: bbox whose top left corner gives the shift along both axes
        :param page: pair (image height, image width) assigned to each table location and passed to each cell shift
        """
        image_height, image_width = page
        # the shift is the same for every table, so it is computed once
        shift_x, shift_y = page_shift.x_top_left, page_shift.y_top_left

        for table in tables:
            # shift tables to original coordinates
            for location in table.locations:
                location.bbox.shift(shift_x=shift_x, shift_y=shift_y)
                location.page_height, location.page_width = image_height, image_width

            for row in table.cells:
                for cell in row:
                    cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)

    def __change_table_boxes_page_width_heigth(self, pdf_width: int, pdf_height: int, tables: List[ScanTable]) -> None:
        """
        Change table boxes' page width and height into pdf space like textual lines.
        Every cell of every table is updated in place.

        :param pdf_width: new page width passed to each cell
        :param pdf_height: new page height passed to each cell
        :param tables: tables whose cells should be updated
        """
        for table in tables:
            for row in table.cells:
                for cell in row:
                    cell.change_lines_boxes_page_width_height(new_page_width=pdf_width, new_page_height=pdf_height)

    def _inside_any_unreadable_block(self, obj_bbox: BBox, unreadable_blocks: List[BBox]) -> bool:
        """
        Check whether obj_bbox has an intersection with at least one of the given blocks.
        Despite the method name, the check performed is `have_intersection_with_box`, not strict containment.

        :param obj_bbox: ["x_top_left", "y_top_left", "width", "height"]
        :param unreadable_blocks: List["x_top_left", "y_top_left", "width", "height"]
        :return: True if some block intersects obj_bbox, False otherwise (also for an empty list of blocks)
        """
        return any(block.have_intersection_with_box(obj_bbox) for block in unreadable_blocks)