# Source code for dedoc.readers.excel_reader.excel_reader

from typing import Optional

from xlrd.sheet import Sheet

from dedoc.data_structures.table import Table
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.base_reader import BaseReader


class ExcelReader(BaseReader):
    """
    This class is used for parsing documents with .xlsx extension.
    Please use :class:`~dedoc.converters.ExcelConverter` for getting xlsx file from similar formats.
    """

    # The two statements below run once, when the class body is executed.
    # NOTE(review): presumably a compatibility workaround for xlrd's xlsx parser
    # (ElementTree iteration API) - confirm against the pinned xlrd version.
    import xlrd
    xlrd.xlsx.ensure_elementtree_imported(False, None)
    xlrd.xlsx.Element_has_iter = True

    def __init__(self, *, config: Optional[dict] = None) -> None:
        """
        Set up the reader for excel-like formats and create its attachments extractor.

        :param config: optional configuration dict, forwarded to the base reader;
            the attachments extractor receives ``self.config``.
        """
        # Imported locally, as in the rest of this class.
        from dedoc.attachments_extractors.concrete_attachments_extractors.excel_attachments_extractor import ExcelAttachmentsExtractor
        from dedoc.extensions import recognized_extensions, recognized_mimes

        super().__init__(
            config=config,
            recognized_extensions=recognized_extensions.excel_like_format,
            recognized_mimes=recognized_mimes.excel_like_format,
        )
        self.attachment_extractor = ExcelAttachmentsExtractor(config=self.config)

    def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
        """
        This method extracts tables and attachments from the document, `lines` attribute remains empty.
        Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.

        :param file_path: path to the workbook, passed to ``xlrd.open_workbook``.
        :param parameters: optional parameters dict; it decides (through
            ``get_param_with_attachments``) whether attachments are extracted.
        :return: document with empty ``lines``, one table per sheet (in sheet order),
            the extracted attachments (or an empty list) and empty ``warnings``.
        """
        import xlrd
        from dedoc.utils.parameter_utils import get_param_with_attachments

        # Only the sheet parsing needs the workbook to be open.
        with xlrd.open_workbook(file_path) as book:
            tables = [
                self.__parse_sheet(sheet_num, book.sheet_by_index(sheet_num))
                for sheet_num in range(book.nsheets)
            ]

        # The extractor works from the file path, not from the opened workbook.
        if get_param_with_attachments(parameters):
            attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters)
        else:
            attachments = []

        return UnstructuredDocument(lines=[], tables=tables, attachments=attachments, warnings=[])

    def __parse_sheet(self, sheet_id: int, sheet: Sheet) -> Table:
        """
        Convert one sheet into a :class:`Table`.

        Every cell of the ``nrows`` x ``ncols`` grid becomes a cell holding a single line
        whose text is ``str()`` of the cell value.

        :param sheet_id: index of the sheet in the workbook, stored as ``page_id``
            in the line metadata and in the table metadata.
        :param sheet: xlrd sheet to read the values from.
        :return: table with one row of cells per sheet row.
        """
        from dedoc.data_structures.cell_with_meta import CellWithMeta
        from dedoc.data_structures.line_metadata import LineMetadata
        from dedoc.data_structures.line_with_meta import LineWithMeta
        from dedoc.data_structures.table_metadata import TableMetadata

        def make_cell(row_id: int, col_id: int) -> CellWithMeta:
            # Cell values are always stringified; line_id is 0 for every cell.
            value = str(sheet.cell_value(rowx=row_id, colx=col_id))
            line = LineWithMeta(line=value, metadata=LineMetadata(page_id=sheet_id, line_id=0))
            return CellWithMeta(lines=[line])

        cells = [
            [make_cell(row_id, col_id) for col_id in range(sheet.ncols)]
            for row_id in range(sheet.nrows)
        ]
        return Table(cells=cells, metadata=TableMetadata(page_id=sheet_id))