# Source code for dedoc.readers.html_reader.html_reader

from typing import List, Optional, Tuple, Union

from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag

from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.table import Table
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.base_reader import BaseReader


class HtmlReader(BaseReader):
    """
    This reader allows to handle documents with the following extensions: .htm, .html, .shtml
    """

    def __init__(self, *, config: Optional[dict] = None) -> None:
        from dedoc.extensions import recognized_extensions, recognized_mimes
        from dedoc.readers.html_reader.html_line_postprocessing import HtmlLinePostprocessing
        from dedoc.readers.html_reader.html_tag_annotation_parser import HtmlTagAnnotationParser

        super().__init__(config=config, recognized_extensions=recognized_extensions.html_like_format, recognized_mimes=recognized_mimes.html_like_format)
        self.postprocessor = HtmlLinePostprocessing()
        self.tag_annotation_parser = HtmlTagAnnotationParser()

    def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
        """
        The method returns document content with all document's lines and tables, attachments remain empty.
        This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
        Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
        """
        from dedoc.utils.utils import calculate_file_hash

        parameters = {} if parameters is None else parameters
        with open(file_path, "rb") as f:
            soup = BeautifulSoup(f.read(), "html.parser")

        # the parameter may arrive as a bool or as a string, so it is normalised through str()
        handle_invisible_table = str(parameters.get("handle_invisible_table", "false")).lower() == "true"
        filepath_hash = calculate_file_hash(path=file_path)

        # lines are read before tables: the order is kept because reading lines rewrites <br> tags in the tree
        lines = self.__read_blocks(soup, filepath_hash=filepath_hash, handle_invisible_table=handle_invisible_table)
        tables = [
            self._read_table(table, filepath_hash)
            for table in soup.find_all("table")
            if self._visible_table(table, handle_invisible_table=handle_invisible_table)
        ]
        document = UnstructuredDocument(tables=tables, lines=lines, attachments=[])
        return self.postprocessor.postprocess(document)

    def __handle_block(self,
                       tag: Union[Tag, str],
                       filepath_hash: str,
                       handle_invisible_table: bool,
                       table: Optional[bool] = False,
                       uid: Optional[str] = "") -> List[LineWithMeta]:
        """
        Convert one node of the html tree into a list of lines, dispatching on the kind of the node.

        :param tag: html tag or text node to handle
        :param filepath_hash: hash of the file, used as a prefix of the lines' uids
        :param handle_invisible_table: if True, every table is treated as a visible one
        :param table: True if the node is located inside a table cell
        :param uid: uid of the parent node, it is mixed into the uid of this node
        :return: lines extracted from the node (may be empty)
        """
        import hashlib
        from dedoc.readers.html_reader.html_tags import HtmlTags

        assert isinstance(tag, (Tag, str))
        tag_uid = hashlib.md5((uid + str(tag.name)).encode()).hexdigest()

        if not self.__is_content_tag(tag, handle_invisible_table=handle_invisible_table):
            block_lines = []
        elif tag.name == "table" and not self._visible_table(tag, handle_invisible_table=handle_invisible_table):
            # if table is invisible and we don't parse invisible tables (handle_invisible_table == False)
            # then we parse table as raw text
            block_lines = self.__handle_invisible_table(block=tag, filepath_hash=filepath_hash, uid=tag_uid)
        elif isinstance(tag, str):
            block_lines = self._handle_text_line(block=tag, filepath_hash=filepath_hash, uid=tag_uid)
        elif tag.name not in HtmlTags.available_tags:
            self.logger.debug(f"skip tag {tag.name.encode()}")
            block_lines = []
        elif tag.name in HtmlTags.special_symbol_tags:
            tag_value = HtmlTags.special_symbol_tags[tag.name]
            block_lines = self._handle_text_line(block=tag_value, filepath_hash=filepath_hash, uid=tag_uid, ignore_space=False)
        elif tag.name in HtmlTags.block_tags:
            # handle_invisible_table and table have to be passed down, otherwise they are reset to their defaults
            # for everything nested into a block tag (html, body, div, ...)
            block_lines = self.__read_blocks(block=tag,
                                             filepath_hash=filepath_hash,
                                             handle_invisible_table=handle_invisible_table,
                                             table=table,
                                             uid=tag_uid)
        elif tag.name in HtmlTags.list_tags:
            block_lines = self.__read_list(lst=tag, uid=tag_uid, filepath_hash=filepath_hash, handle_invisible_table=handle_invisible_table)
        else:
            block_lines = self.__handle_single_tag(tag=tag, filepath_hash=filepath_hash, uid=tag_uid, table=table)

        # lines that did not get a tag name from a nested handler inherit the name of the current node
        for line in block_lines:
            if not getattr(line.metadata, "html_tag", None):
                line.metadata.html_tag = tag.name
        return block_lines

    def __handle_single_tag(self, tag: Tag, filepath_hash: str, uid: str, table: Optional[bool] = False) -> List[LineWithMeta]:
        """
        Make a single line from the whole text of the tag. Header tags (h1, h2, ...) get a header hierarchy level.

        :return: list with one line or an empty list if the tag has no visible text
        """
        import hashlib
        from dedoc.data_structures.hierarchy_level import HierarchyLevel
        from dedoc.readers.html_reader.html_tags import HtmlTags

        text = self.__get_text(tag, table)
        if not text or text.isspace():
            return []

        annotations = self.tag_annotation_parser.parse(tag=tag)
        # the header level is the number in the tag name, e.g. 2 for <h2>
        header_level = int(tag.name[1:]) if tag.name in HtmlTags.header_tags else 0
        line_type = HierarchyLevel.unknown if header_level == 0 else HierarchyLevel.header
        tag_uid = hashlib.md5((uid + text).encode()).hexdigest()
        line = self.__make_line(line=text,
                                line_type=line_type,
                                header_level=header_level,
                                uid=tag_uid,
                                filepath_hash=filepath_hash,
                                annotations=annotations)
        line.metadata.html_tag = tag.name
        return [line]

    def __read_blocks(self,
                      block: Tag,
                      filepath_hash: str = "",
                      handle_invisible_table: bool = False,
                      table: Optional[bool] = False,
                      uid: Optional[str] = "") -> List[LineWithMeta]:
        """
        Read all direct children of the block one by one and concatenate the resulting lines.

        :param block: html tag whose children should be read
        :param filepath_hash: hash of the file, used as a prefix of the lines' uids
        :param handle_invisible_table: if True, every table is treated as a visible one
        :param table: True if the block is located inside a table cell
        :param uid: uid of the parent node
        :return: lines of all children in the document order
        """
        import hashlib

        tag_uid = hashlib.md5((filepath_hash + uid + str(block.name)).encode()).hexdigest()
        if not self.__is_content_tag(block, handle_invisible_table=handle_invisible_table):
            return []

        lines = []
        for tag in block:
            assert isinstance(tag, (Tag, str))
            block_lines = self.__handle_block(tag=tag,
                                              filepath_hash=filepath_hash,
                                              handle_invisible_table=handle_invisible_table,
                                              table=table,
                                              uid=tag_uid)
            lines.extend(block_lines)
        return lines

    def _handle_text_line(self, block: str, filepath_hash: str, uid: str, ignore_space: bool = True) -> List[LineWithMeta]:
        """
        Make a line with unknown type from a raw string.

        :param ignore_space: if True, strings consisting only of whitespace produce no line
        :return: list with one line or an empty list
        """
        import hashlib
        from dedoc.data_structures.hierarchy_level import HierarchyLevel

        if not block.strip() and ignore_space:
            return []
        tag_uid = hashlib.md5((uid + block).encode()).hexdigest()
        line = self.__make_line(block, HierarchyLevel.unknown, 0, uid=tag_uid, filepath_hash=filepath_hash)
        return [line]

    def __make_line(self,
                    line: str,
                    line_type: str,
                    header_level: int = 0,
                    uid: Optional[str] = None,
                    filepath_hash: Optional[str] = None,
                    annotations: Optional[List] = None) -> LineWithMeta:
        """
        Create a line with metadata. The hierarchy level is set only for headers (header_level > 0),
        the resulting uid is "<filepath_hash>_<uid>".
        """
        from dedoc.data_structures.hierarchy_level import HierarchyLevel
        from dedoc.data_structures.line_metadata import LineMetadata

        if annotations is None:
            annotations = []

        level = None if header_level == 0 else HierarchyLevel(1, header_level, False, line_type=line_type)
        metadata = LineMetadata(page_id=0, line_id=None, tag_hierarchy_level=level)  # TODO line_id

        uid = f"{filepath_hash}_{uid}"
        return LineWithMeta(line=line, metadata=metadata, annotations=annotations, uid=uid)

    def __get_li_header(self, list_type: str, index: int) -> LineWithMeta:
        """
        Build the marker line of a list item.

        :param list_type: "a" or "A" give letters followed by ") " (a, b, ..., z, aa, ab, ...),
            an empty string gives an empty marker, any other value gives the number followed by ". "
        :param index: zero-based index of the item inside its list
        :return: line with the marker and the list_item hierarchy level
        """
        import string
        from dedoc.data_structures.hierarchy_level import HierarchyLevel
        from dedoc.data_structures.line_metadata import LineMetadata

        end = ") " if list_type in ["a", "A"] else ". "
        if list_type == "":
            header = ""
        elif list_type in ["a", "A"]:
            alphabet = string.ascii_lowercase if list_type == "a" else string.ascii_uppercase
            header = alphabet[index % len(alphabet)]
            # after the alphabet is exhausted the numbering continues with two and more letters
            while index >= len(alphabet):
                index = index // len(alphabet) - 1
                header = alphabet[index % len(alphabet)] + header
            header = header + end
        else:
            header = str(index + 1) + end

        metadata = LineMetadata(tag_hierarchy_level=HierarchyLevel(2, 1, False, line_type=HierarchyLevel.list_item), page_id=0, line_id=0)
        header_line = LineWithMeta(line=header, metadata=metadata)
        return header_line

    def __read_list(self, lst: Tag, uid: str, filepath_hash: str, handle_invisible_table: bool) -> List[LineWithMeta]:
        """
        Read the items of a list tag. Children that are not list items are skipped and do not increase the item index.
        """
        import hashlib
        from dedoc.readers.html_reader.html_tags import HtmlTags

        tag_uid = hashlib.md5((uid + str(lst.name)).encode()).hexdigest()
        lines = []
        # ordered lists are numbered by default, other lists have no marker unless the "type" attribute is set
        list_type = lst.get("type", "1" if lst.name in HtmlTags.ordered_list else "")
        item_index = 0

        for item in lst:
            if item.name in HtmlTags.list_items:
                item_lines = self.__handle_list_item(item=item,
                                                     item_index=item_index,
                                                     list_type=list_type,
                                                     filepath_hash=filepath_hash,
                                                     uid=tag_uid,
                                                     handle_invisible_table=handle_invisible_table)
                item_index += 1
                lines.extend(item_lines)
        return lines

    def __handle_list_item(self,
                           item: Tag,
                           item_index: int,
                           list_type: str,
                           filepath_hash: str,
                           uid: str,
                           handle_invisible_table: bool) -> List[LineWithMeta]:
        """
        Read one list item: the item marker is followed by the content of the item.
        Content lines with unknown hierarchy level are appended to the current line, any other line starts a new one.
        """
        import hashlib

        tag_uid = hashlib.md5((uid + str(item.name)).encode()).hexdigest()
        lines = []
        header_line = self.__get_li_header(list_type=list_type, index=item_index)
        block_lines = self.__handle_block(item, filepath_hash=filepath_hash, uid=tag_uid, handle_invisible_table=handle_invisible_table)
        hl_depth = header_line.metadata.tag_hierarchy_level.level_1

        for line in block_lines:
            if line.metadata.tag_hierarchy_level.is_unknown():
                header_line += line
            else:
                # Handle complex and nested lists
                lines.append(header_line)
                line.metadata.tag_hierarchy_level.level_1 += hl_depth
                header_line = line
        lines.append(header_line)
        return lines

    def __get_text(self, tag: Tag, table: Optional[bool] = False) -> str:
        """
        Get the text of the tag, <br> tags are turned into line breaks.
        Note that the <br> tags are replaced in the tree itself, so the tag is modified by this call.

        :param table: True if the tag is located inside a table cell, in this case paragraphs get no trailing line break
        """
        for br in tag.find_all("br"):
            br.replace_with("\n")

        text = tag.get_text()
        if tag.name == "p" and not table:
            text = text + "\n"
        return "" if text is None else text

    def __is_content_tag(self, tag: Tag, handle_invisible_table: bool = False) -> bool:
        """
        Check if the given tag is a content tag.

        :param tag: html tag
        :param handle_invisible_table: whether an invisible table should be handled as a table
        :return: True if the tag is a content tag, False otherwise
        """
        from dedoc.readers.html_reader.html_tags import HtmlTags

        if tag.name in HtmlTags.service_tags:
            return False
        if tag.name == "table" and not self._visible_table(tag, handle_invisible_table=handle_invisible_table):
            return True
        return not isinstance(tag, Doctype) and not isinstance(tag, Comment)

    def __handle_invisible_table(self, block: Tag, filepath_hash: str, uid: str) -> List[LineWithMeta]:
        """
        Read a table as plain text: each non-empty row becomes one line, the texts of the cells are joined with tabs.
        """
        import hashlib
        from dedoc.data_structures.hierarchy_level import HierarchyLevel

        result = []
        rows = self._read_table(block, filepath_hash).cells
        for row in rows:
            text = "\t".join([cell.get_text() for cell in row])
            if text.strip() != "":
                tag_uid = hashlib.md5((uid + text).encode()).hexdigest()
                line = self.__make_line(line=text, line_type=HierarchyLevel.unknown, uid=tag_uid, filepath_hash=filepath_hash)
                result.append(line)
        return result

    def __clone_cell(self, el: Union[Tag, NavigableString]) -> Union[Tag, NavigableString]:
        """
        Make a deep copy of a node. Copies of table cells are marked as hidden, keep the attributes of the source cell
        and get colspan = rowspan = 1. Attributes of other tags are not copied.
        """
        from dedoc.readers.html_reader.html_tags import HtmlTags

        if isinstance(el, NavigableString):
            return type(el)(el)

        copy = Tag(None, el.builder, el.name, el.namespace, el.nsprefix)
        if el.name in HtmlTags.table_cells:
            el_attrs = el.attrs
            copy.hidden = True
            copy.attrs = dict(el_attrs)
            copy.attrs["colspan"] = 1
            copy.attrs["rowspan"] = 1
        for child in el.contents:
            copy.append(self.__clone_cell(child))
        return copy

    @staticmethod
    def _parse_span(cell: Tag, attribute: str) -> int:
        """
        Read the colspan/rowspan attribute of a cell as an integer.
        A missing attribute or a value that is not an integer (e.g. "" or "2px") gives 1 instead of an exception.
        """
        try:
            return int(cell.attrs.get(attribute, 1))
        except (TypeError, ValueError):
            return 1

    def __split_table_cells(self, table: Tag, table_list: List[List[Tag]]) -> None:
        """
        Expand cells with colspan/rowspan > 1 in-place: hidden copies of the cell are inserted into table_list
        to the right of the cell (colspan) and into the rows below it (rowspan).

        :param table: table tag
        :param table_list: rows of the table built from the same tag, it is modified by this method
        """
        from dedoc.readers.html_reader.html_tags import HtmlTags

        for row_index, row in enumerate(table.find_all(HtmlTags.table_rows)):
            row_cells = table_list[row_index]
            for cell in row.find_all(HtmlTags.table_cells):
                cell_rowspan = self._parse_span(cell, "rowspan")
                cell_colspan = self._parse_span(cell, "colspan")
                if cell_rowspan <= 1 and cell_colspan <= 1:
                    continue

                # the row may already contain copies inserted for the previous cells or for the rows above,
                # so the current position of the cell is looked up instead of using its index in the html row;
                # identity is compared because equal-looking tags are considered equal by bs4
                position = next((i for i, row_cell in enumerate(row_cells) if row_cell is cell), None)
                if position is None:
                    continue

                cell_copy = self.__clone_cell(cell)
                row_cells[position + 1:position + 1] = [cell_copy] * (cell_colspan - 1)
                # rowspan may point beyond the last row of the table, such rows are ignored
                last_row = min(row_index + cell_rowspan, len(table_list))
                for index in range(row_index + 1, last_row):
                    table_list[index][position:position] = [cell_copy] * cell_colspan

    def __fix_table(self, table: Tag) -> List[List[Tag]]:
        """
        Turn the table tag into a list of rows of cells where merged cells are expanded into separate cells.
        """
        from dedoc.readers.html_reader.html_tags import HtmlTags

        # create table list
        table_list = [list(row.find_all(HtmlTags.table_cells)) for row in table.find_all(HtmlTags.table_rows)]
        self.__split_table_cells(table, table_list)
        return table_list

    def _read_table(self, table: Tag, filepath_hash: str) -> Table:
        """
        Read the table tag: each cell is read as a block of lines, copies of merged cells are marked as invisible.
        """
        from dedoc.data_structures.cell_with_meta import CellWithMeta
        from dedoc.data_structures.table_metadata import TableMetadata

        cells_with_meta = []
        fixed_table = self.__fix_table(table)

        for row in fixed_table:
            row_lines = []
            for cell in row:
                cell_with_meta = CellWithMeta(
                    lines=self.__read_blocks(block=cell, filepath_hash=filepath_hash, handle_invisible_table=False, table=True),  # read each cell as a block
                    colspan=self._parse_span(cell, "colspan"),
                    rowspan=self._parse_span(cell, "rowspan"),
                    invisible=cell.hidden if cell.hidden else False
                )
                row_lines.append(cell_with_meta)
            cells_with_meta.append(row_lines)

        return Table(cells=cells_with_meta, metadata=TableMetadata(page_id=0))

    def _visible_table(self, table: Tag, handle_invisible_table: bool) -> bool:
        """
        Check if the table should be handled as a table.

        :param table: table tag
        :param handle_invisible_table: if True, every table is considered visible
        :return: True if handle_invisible_table is set, if some <td> has a solid top or bottom border in its style
            or if the "border" attribute of the table is present and differs from "0"
        """
        if handle_invisible_table:
            return True
        assert table.name == "table", f"block {table} is not table"

        for td in table.find_all("td"):
            style = td.attrs.get("style", "")
            if "border-bottom-style:solid" in style or "border-top-style:solid" in style:
                return True

        return table.attrs.get("border", "0") != "0"