Source code for dedoc.readers.pdf_reader.data_classes.tables.scantable

from typing import List

from dedocutils.data_structures import BBox

from dedoc.data_structures.cell_with_meta import CellWithMeta
from dedoc.data_structures.table import Table
from dedoc.data_structures.table_metadata import TableMetadata
from dedoc.readers.pdf_reader.data_classes.tables.cell import Cell
from dedoc.readers.pdf_reader.data_classes.tables.location import Location


[docs]class ScanTable(Table): """ Utility class for storing recognized tables from document images. The class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer` works with this class. """ def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1, page_width: int = None, page_height: int = None) -> None: super().__init__(cells, TableMetadata(page_id=page_number)) self.order = order self.locations = [Location(page_number, bbox, page_width=page_width, page_height=page_height)] def extended(self, table: "ScanTable") -> None: # extend locations self.locations.extend(table.locations) # extend values self.cells.extend(table.cells) # extend order self.order = max(self.order, table.order) def check_on_cell_instance(self) -> bool: if len(self.cells) == 0: return False if len(self.cells[0]) == 0: return False if not isinstance(self.cells[0][0], Cell): return False return True def __get_cells_text(self, cells: List[List[CellWithMeta]]) -> List[List[str]]: return [[cell.get_text() for cell in row] for row in cells] @property def location(self) -> Location: return min(self.locations) @property def uid(self) -> str: return self.metadata.uid def to_dict(self) -> dict: from collections import OrderedDict data_text = self.__get_cells_text(self.cells) res = OrderedDict() res["locations"] = [location.to_dict() for location in self.locations] res["cells"] = data_text return res