# Source code for dedoc.readers.html_reader.html_reader

from typing import List, Optional, Tuple, Union

from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag

from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.table import Table
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.base_reader import BaseReader


class HtmlReader(BaseReader):
    """
    This reader handles documents with the following extensions: .htm, .html, .shtml
    """

    def __init__(self, *, config: Optional[dict] = None) -> None:
        """
        Register the html-like extensions and mime types in the base reader and create the helper objects.

        :param config: configuration dictionary, passed to the base reader as is
        """
        from dedoc.extensions import recognized_extensions, recognized_mimes
        from dedoc.readers.html_reader.html_line_postprocessing import HtmlLinePostprocessing
        from dedoc.readers.html_reader.html_tag_annotation_parser import HtmlTagAnnotationParser

        super().__init__(
            config=config,
            recognized_extensions=recognized_extensions.html_like_format,
            recognized_mimes=recognized_mimes.html_like_format,
        )
        # applied to the whole document at the end of `read`
        self.postprocessor = HtmlLinePostprocessing()
        # extracts annotations from a single tag
        self.tag_annotation_parser = HtmlTagAnnotationParser()

[docs]    def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
        """
        The method return document content with all document's lines and tables, attachments remain empty.
        This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
        Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
        """
        from dedoc.utils.utils import calculate_file_hash

        parameters = {} if parameters is None else parameters
        with open(file_path, "rb") as f:
            soup = BeautifulSoup(f.read(), "html.parser")

        handle_invisible_table = str(parameters.get("handle_invisible_table", "false")).lower() == "true"
        filepath_hash = calculate_file_hash(path=file_path)
        lines = self.__read_blocks(soup, filepath_hash=filepath_hash, handle_invisible_table=handle_invisible_table)
        tables = [
            self._read_table(table, filepath_hash) for table in soup.find_all("table") if self._visible_table(table,
                                                                                                              handle_invisible_table=handle_invisible_table)
        ]
        document = UnstructuredDocument(tables=tables, lines=lines, attachments=[])
        document_postprocess = self.postprocessor.postprocess(document)
        return document_postprocess

    def __handle_block(self, tag: Union[Tag, str], filepath_hash: str, handle_invisible_table: bool, table: Optional[bool] = False,
                       uid: Optional[str] = "") -> List[LineWithMeta]:
        """
        Convert one node of the html tree into a list of lines, dispatching on the kind of the node.

        :param tag: node to handle, a tag or a text node
        :param filepath_hash: hash of the file, used as a part of lines' uids
        :param handle_invisible_table: if True, every table is treated as a visible one (see `_visible_table`)
        :param table: forwarded to `__handle_single_tag`, where it is passed on to `__get_text`
        :param uid: uid of the parent node, mixed into the uid of this node
        :return: lines extracted from the node, the list is empty for skipped nodes
        """
        import hashlib
        from dedoc.readers.html_reader.html_tags import HtmlTags

        assert isinstance(tag, (Tag, str))
        tag_uid = hashlib.md5((uid + str(tag.name)).encode()).hexdigest()
        if not self.__is_content_tag(tag, handle_invisible_table=handle_invisible_table):
            block_lines = []
        elif tag.name == "table" and not self._visible_table(tag, handle_invisible_table=handle_invisible_table):
            # if table is invisible and we don't parse invisible tables (handle_invisible_table == False)
            # then we parse table as raw text
            block_lines = self.__handle_invisible_table(block=tag, filepath_hash=filepath_hash, uid=tag_uid)
        elif isinstance(tag, str):
            block_lines = self._handle_text_line(block=tag, filepath_hash=filepath_hash, uid=tag_uid)
        elif tag.name not in HtmlTags.available_tags:
            self.logger.debug(f"skip tag {tag.name.encode()}")
            block_lines = []
        elif tag.name in HtmlTags.special_symbol_tags:
            tag_value = HtmlTags.special_symbol_tags[tag.name]
            block_lines = self._handle_text_line(block=tag_value, filepath_hash=filepath_hash, uid=tag_uid, ignore_space=False)
        elif tag.name in HtmlTags.block_tags:
            # handle_invisible_table has to be forwarded, otherwise it falls back to the default (False) for everything
            # nested inside a block tag and invisible tables are emitted as text even when the option is on
            # NOTE(review): `table` is still not forwarded here, nested blocks are read with table=False - confirm this is intended
            block_lines = self.__read_blocks(block=tag, filepath_hash=filepath_hash, handle_invisible_table=handle_invisible_table, uid=tag_uid)
        elif tag.name in HtmlTags.list_tags:
            block_lines = self.__read_list(lst=tag, uid=tag_uid, filepath_hash=filepath_hash, handle_invisible_table=handle_invisible_table)
        else:
            block_lines = self.__handle_single_tag(tag=tag, filepath_hash=filepath_hash, uid=tag_uid, table=table)
        # lines that did not get a tag name from a nested call are attributed to the current node
        for line in block_lines:
            if not getattr(line.metadata, "html_tag", None):
                line.metadata.html_tag = tag.name
        return block_lines

    def __handle_single_tag(self, tag: Tag, filepath_hash: str, uid: str, table: Optional[bool] = False) -> List[LineWithMeta]:
        """
        Turn the text of one tag into a single line.

        :param tag: tag to read the text from
        :param filepath_hash: hash of the file, used as a part of the line's uid
        :param uid: uid of the node, mixed with the text into the line's uid
        :param table: passed to `__get_text`
        :return: list with one line, or an empty list if the tag has no text or only whitespace
        """
        import hashlib
        from dedoc.data_structures.hierarchy_level import HierarchyLevel
        from dedoc.readers.html_reader.html_tags import HtmlTags

        content = self.__get_text(tag, table)
        if not content or content.isspace():
            return []

        annotations = self.tag_annotation_parser.parse(tag=tag)

        # for header tags the level is the number following the first character of the tag name
        if tag.name in HtmlTags.header_tags:
            header_level = int(tag.name[1:])
        else:
            header_level = 0
        line_type = HierarchyLevel.header if header_level != 0 else HierarchyLevel.unknown

        line_uid = hashlib.md5((uid + content).encode()).hexdigest()
        result_line = self.__make_line(
            line=content,
            line_type=line_type,
            header_level=header_level,
            uid=line_uid,
            filepath_hash=filepath_hash,
            annotations=annotations,
        )
        result_line.metadata.html_tag = tag.name
        return [result_line]

    def __read_blocks(self, block: Tag, filepath_hash: str = "", handle_invisible_table: bool = False, table: Optional[bool] = False,
                      uid: Optional[str] = "") -> List[LineWithMeta]:
        """
        Read the lines of all direct children of the given block.

        :param block: node whose children are handled one by one
        :param filepath_hash: hash of the file, used as a part of lines' uids
        :param handle_invisible_table: forwarded to `__is_content_tag` and `__handle_block`
        :param table: forwarded to `__handle_block`
        :param uid: uid of the parent node, mixed into the uid of this block
        :return: concatenated lines of the children, an empty list if the block is not a content tag
        """
        import hashlib

        block_uid = hashlib.md5((filepath_hash + uid + str(block.name)).encode()).hexdigest()
        if not self.__is_content_tag(block, handle_invisible_table=handle_invisible_table):
            return []

        collected = []
        for child in block:
            assert isinstance(child, (Tag, str))
            collected += self.__handle_block(
                tag=child,
                filepath_hash=filepath_hash,
                handle_invisible_table=handle_invisible_table,
                table=table,
                uid=block_uid,
            )
        return collected

    def _handle_text_line(self, block: str, filepath_hash: str, uid: str, ignore_space: bool = True) -> List[LineWithMeta]:
        """
        Wrap a plain string into a line of unknown type.

        :param block: text of the line
        :param filepath_hash: hash of the file, used as a part of the line's uid
        :param uid: uid of the node, mixed with the text into the line's uid
        :param ignore_space: if True, a string that is empty after stripping produces no line
        :return: list with one line, or an empty list for a skipped string
        """
        import hashlib
        from dedoc.data_structures.hierarchy_level import HierarchyLevel

        is_blank = not block.strip()
        if is_blank and ignore_space:
            return []

        line_uid = hashlib.md5((uid + block).encode()).hexdigest()
        return [self.__make_line(block, HierarchyLevel.unknown, 0, uid=line_uid, filepath_hash=filepath_hash)]

    def __make_line(self, line: str, line_type: str, header_level: int = 0, uid: str = None, filepath_hash: str = None,
                    annotations: List = None) -> LineWithMeta:
        """
        Create a line with metadata from the given text.

        :param line: text of the line
        :param line_type: type of the line, used only when `header_level` is not 0
        :param header_level: 0 means that no tag hierarchy level is set, other values become the second level of the hierarchy level
        :param uid: second part of the resulting uid
        :param filepath_hash: first part of the resulting uid
        :param annotations: annotations of the line, an empty list is used when None is given
        :return: line whose uid is "<filepath_hash>_<uid>"
        """
        from dedoc.data_structures.hierarchy_level import HierarchyLevel
        from dedoc.data_structures.line_metadata import LineMetadata

        annotations = [] if annotations is None else annotations

        if header_level == 0:
            level = None
        else:
            level = HierarchyLevel(1, header_level, False, line_type=line_type)
        metadata = LineMetadata(page_id=0, line_id=None, tag_hierarchy_level=level)  # TODO line_id

        return LineWithMeta(line=line, metadata=metadata, annotations=annotations, uid=f"{filepath_hash}_{uid}")

    def __get_li_header(self, list_type: str, index: int) -> LineWithMeta:
        """
        Build the marker line of a list item.

        :param list_type: "" gives an empty marker, "a"/"A" give letters followed by ") ", any other value gives a number followed by ". "
        :param index: zero-based index of the item in its list
        :return: line with the marker text and list item metadata
        """
        import string
        from dedoc.data_structures.hierarchy_level import HierarchyLevel
        from dedoc.data_structures.line_metadata import LineMetadata

        if list_type == "":
            marker = ""
        elif list_type in ("a", "A"):
            letters = string.ascii_lowercase if list_type == "a" else string.ascii_uppercase
            base = len(letters)

            # letters are produced from the last one to the first one: a, ..., z, aa, ab, ...
            remaining = index
            symbols = [letters[remaining % base]]
            while remaining >= base:
                remaining = remaining // base - 1
                symbols.append(letters[remaining % base])

            marker = "".join(reversed(symbols)) + ") "
        else:
            marker = str(index + 1) + ". "

        item_level = HierarchyLevel(2, 1, False, line_type=HierarchyLevel.list_item)
        metadata = LineMetadata(tag_hierarchy_level=item_level, page_id=0, line_id=0)
        return LineWithMeta(line=marker, metadata=metadata)

    def __read_list(self, lst: Tag, uid: str, filepath_hash: str, handle_invisible_table: bool) -> List[LineWithMeta]:
        """
        Read the items of a list tag.

        :param lst: list tag
        :param uid: uid of the parent node, mixed into the uid of the list
        :param filepath_hash: hash of the file, used as a part of lines' uids
        :param handle_invisible_table: forwarded to `__handle_list_item`
        :return: lines of all list items in the order of their appearance
        """
        import hashlib
        from dedoc.readers.html_reader.html_tags import HtmlTags

        list_uid = hashlib.md5((uid + str(lst.name)).encode()).hexdigest()
        # the `type` attribute wins, otherwise ordered lists are numbered and other lists have no marker
        default_type = "1" if lst.name in HtmlTags.ordered_list else ""
        list_type = lst.get("type", default_type)

        result = []
        position = 0
        for child in lst:
            # only list items are counted and read, other children are ignored
            if child.name not in HtmlTags.list_items:
                continue
            result.extend(
                self.__handle_list_item(
                    item=child,
                    item_index=position,
                    list_type=list_type,
                    filepath_hash=filepath_hash,
                    uid=list_uid,
                    handle_invisible_table=handle_invisible_table,
                )
            )
            position += 1
        return result

    def __handle_list_item(self, item: Tag, item_index: int, list_type: str, filepath_hash: str, uid: str, handle_invisible_table: bool) -> List[LineWithMeta]:
        """
        Read one list item and glue its text to the item marker.

        :param item: list item tag
        :param item_index: zero-based index of the item in its list
        :param list_type: type of the list, see `__get_li_header`
        :param filepath_hash: hash of the file, used as a part of lines' uids
        :param uid: uid of the list, mixed into the uid of the item
        :param handle_invisible_table: forwarded to `__handle_block`
        :return: the marker line with the item's lines of unknown type appended to it, followed by the lines with a known level
        """
        import hashlib

        item_uid = hashlib.md5((uid + str(item.name)).encode()).hexdigest()
        finished = []
        current = self.__get_li_header(list_type=list_type, index=item_index)
        content = self.__handle_block(item, filepath_hash=filepath_hash, uid=item_uid, handle_invisible_table=handle_invisible_table)
        marker_depth = current.metadata.tag_hierarchy_level.level_1

        for piece in content:
            if piece.metadata.tag_hierarchy_level.is_unknown():
                # plain text continues the line that is being accumulated
                current += piece
                continue
            # Handle complex and nested lists: a line with a known level starts a new accumulated line
            finished.append(current)
            piece.metadata.tag_hierarchy_level.level_1 += marker_depth
            current = piece

        finished.append(current)
        return finished

    # not currently used, but may be useful in the future
    def __get_text(self, tag: Tag, table: Optional[bool] = False) -> [str, int, int]:
        for br in tag.find_all("br"):
            br.replace_with("\n")
        text = tag.getText() + "\n" if tag.name == "p" and not table else tag.getText()
        text = "" if text is None else text
        return text

    def __is_content_tag(self, tag: Tag, handle_invisible_table: bool = False) -> bool:
        """
        Check if the given tag is a content tag.

        @param tag: html tag
        @param handle_invisible_table: whether invisible tables should be handled as tables;
            kept for backward compatibility, the result does not depend on it
        @return: False for service tags, doctype declarations and comments, True otherwise.
        """
        from dedoc.readers.html_reader.html_tags import HtmlTags

        if tag.name in HtmlTags.service_tags:
            return False
        # No special case is needed for invisible tables: a node named "table" is neither a Doctype nor a Comment,
        # so the check below already returns True for it without traversing the table's cells.
        return not isinstance(tag, (Doctype, Comment))

    def __handle_invisible_table(self, block: Tag, filepath_hash: str, uid: str) -> List[LineWithMeta]:
        """
        Read a table as plain text, one line per non-empty row.

        :param block: table tag
        :param filepath_hash: hash of the file, used as a part of lines' uids
        :param uid: uid of the table, mixed with the row text into the line's uid
        :return: lines of unknown type, the texts of a row's cells are joined with tabs
        """
        import hashlib
        from dedoc.data_structures.hierarchy_level import HierarchyLevel

        table_rows = self._read_table(block, filepath_hash).cells
        text_lines = []
        for table_row in table_rows:
            row_text = "\t".join([cell.get_text() for cell in table_row])
            # rows without visible text produce no line
            if row_text.strip() == "":
                continue
            row_uid = hashlib.md5((uid + row_text).encode()).hexdigest()
            text_lines.append(self.__make_line(line=row_text, line_type=HierarchyLevel.unknown, uid=row_uid, filepath_hash=filepath_hash))
        return text_lines

    def __clone_cell(self, el: Union[Tag, NavigableString]) -> Union[Tag, NavigableString]:
        """
        Recursively copy a node of the html tree.

        A copied table cell gets the attributes of the original one with `colspan` and `rowspan` set to 1
        and is marked as hidden. Other tags are copied without attributes, strings are copied as is.

        :param el: tag or string to copy
        :return: copy of the node that is not attached to the original tree
        """
        from dedoc.readers.html_reader.html_tags import HtmlTags

        if isinstance(el, NavigableString):
            return type(el)(el)

        # `prefix` is the attribute that stores the namespace prefix given to the Tag constructor;
        # NOTE(review): the former `el.nsprefix` looks like it is resolved by bs4 as a search for a child tag named "nsprefix" - confirm
        copy = Tag(None, el.builder, el.name, el.namespace, el.prefix)
        if el.name in HtmlTags.table_cells:
            el_attrs = el.attrs
            copy.hidden = True
            copy.attrs = dict(el_attrs)
            # the copy fills exactly one slot of the table
            copy.attrs["colspan"] = 1
            copy.attrs["rowspan"] = 1
        for child in el.contents:
            copy.append(self.__clone_cell(child))
        return copy

    def __split_table_cells(self, table: Tag, table_list: List[List[Tag]]) -> None:
        """
        Expand cells with `colspan`/`rowspan` greater than 1, so that each row of `table_list` gets one entry per occupied column.

        The rows of `table_list` are rewritten in place. The original cell stays at the first column it occupies,
        every other occupied slot (to the right and in the following rows) is filled with one shared hidden copy made by `__clone_cell`.
        Columns are counted in the already expanded row, so copies inserted for earlier cells do not shift later ones.
        A `rowspan` that reaches beyond the last row is cut off, span values that are not positive integers are treated as 1.

        :param table: table tag, not used: `table_list` already holds its cells row by row; kept for interface compatibility
        :param table_list: rows of the table as lists of cells, modified in place
        """
        def read_span(cell: Tag, attribute: str) -> int:
            # missing, malformed or non-positive values mean that the cell occupies a single slot
            try:
                return max(int(cell.attrs.get(attribute, 1)), 1)
            except (TypeError, ValueError):
                return 1

        # column index -> (hidden copy, number of following rows that the copy still has to occupy)
        carried = {}

        for row in table_list:
            # copies that occupy slots of the current row because of a rowspan in a previous row
            inherited = {column: cell_copy for column, (cell_copy, _) in carried.items()}
            carried = {column: (cell_copy, rows_left - 1) for column, (cell_copy, rows_left) in carried.items() if rows_left > 1}

            expanded = []
            for cell in row:
                # slots taken by cells from the rows above come before the row's own cell
                while len(expanded) in inherited:
                    expanded.append(inherited.pop(len(expanded)))

                first_column = len(expanded)
                expanded.append(cell)
                cell_rowspan = read_span(cell, "rowspan")
                cell_colspan = read_span(cell, "colspan")
                if cell_rowspan > 1 or cell_colspan > 1:
                    cell_copy = self.__clone_cell(cell)
                    expanded.extend([cell_copy] * (cell_colspan - 1))
                    if cell_rowspan > 1:
                        for column in range(first_column, first_column + cell_colspan):
                            carried[column] = (cell_copy, cell_rowspan - 1)

            # the row has fewer own cells than expected: remaining copies are appended in the column order
            expanded.extend(inherited[column] for column in sorted(inherited))
            row[:] = expanded

    def __fix_table(self, table: Tag) -> List[List[Tag]]:
        """
        Represent the table as a list of rows, where each row is a list of cells, and expand the cells that span several slots.

        :param table: table tag
        :return: rows of the table after `__split_table_cells` has been applied to them
        """
        from dedoc.readers.html_reader.html_tags import HtmlTags

        # one plain list of cell tags per row tag found in the table
        rows = [list(row_tag.find_all(HtmlTags.table_cells)) for row_tag in table.find_all(HtmlTags.table_rows)]
        self.__split_table_cells(table, rows)
        return rows

    def _read_table(self, table: Tag, filepath_hash: str) -> Table:
        """
        Convert a table tag into a table with metadata.

        :param table: table tag
        :param filepath_hash: hash of the file, used as a part of lines' uids
        :return: table whose cells hold the lines read from the corresponding cell tags, the page id is 0
        """
        from dedoc.data_structures.cell_with_meta import CellWithMeta
        from dedoc.data_structures.table_metadata import TableMetadata

        cells_with_meta = []
        for table_row in self.__fix_table(table):
            row_cells = [
                CellWithMeta(
                    # read each cell as a block
                    lines=self.__read_blocks(block=cell, filepath_hash=filepath_hash, handle_invisible_table=False, table=True),
                    colspan=int(cell.attrs.get("colspan", 1)),
                    rowspan=int(cell.attrs.get("rowspan", 1)),
                    invisible=cell.hidden or False
                )
                for cell in table_row
            ]
            cells_with_meta.append(row_cells)

        return Table(cells=cells_with_meta, metadata=TableMetadata(page_id=0))

    def _visible_table(self, table: Tag, handle_invisible_table: bool) -> bool:
        if handle_invisible_table:
            return True
        assert table.name == "table", f"block {table} is not table"
        for td in table.find_all("td"):
            style = td.attrs.get("style", "")
            if "border-bottom-style:solid" in style or "border-top-style:solid" in style:
                return True
        return table.attrs.get("border", "0") != "0"