# Source code for dedoc.readers.html_reader.html_reader

import hashlib
import logging
import string
import uuid
from typing import List, Optional, Union

from bs4 import BeautifulSoup
from bs4 import Comment, Doctype, Tag

from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_metadata import LineMetadata
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.table import Table
from dedoc.data_structures.table_metadata import TableMetadata
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.base_reader import BaseReader
from dedoc.readers.html_reader.html_line_postprocessing import HtmlLinePostprocessing
from dedoc.readers.html_reader.html_tag_annotation_parser import HtmlTagAnnotationParser
from dedoc.readers.html_reader.html_tags import HtmlTags
from dedoc.utils.utils import calculate_file_hash


class HtmlReader(BaseReader):
    """
    This reader handles documents with the following extensions: .html, .shtml
    """
[docs]    def __init__(self, *, config: dict) -> None:
        """
        :param config: configuration of the reader, e.g. logger for logging
        """
        self.config = config
        self.logger = config.get("logger", logging.getLogger())
        self.postprocessor = HtmlLinePostprocessing()
        self.tag_annotation_parser = HtmlTagAnnotationParser()

[docs]    def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
        """
        Check if the document extension is suitable for this reader.
        Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
        """
        return extension.lower() in [".html", ".shtml"] or mime in ["text/html"]

[docs]    def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument:
        """
        The method return document content with all document's lines and tables, attachments remain empty.
        This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
        Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
        """
        parameters = {} if parameters is None else parameters
        with open(path, "rb") as f:
            soup = BeautifulSoup(f.read(), "html.parser")

        handle_invisible_table = str(parameters.get("handle_invisible_table", "false")).lower() == "true"
        path_hash = calculate_file_hash(path=path)
        lines = self.__read_blocks(soup, path_hash=path_hash, handle_invisible_table=handle_invisible_table)
        tables = [self._read_table(table) for table in soup.find_all("table")
                  if self._visible_table(table, handle_invisible_table=handle_invisible_table)]
        document = UnstructuredDocument(tables=tables, lines=lines, attachments=[])
        document_postprocess = self.postprocessor.postprocess(document)
        return document_postprocess

    def __handle_block(self, tag: Union[Tag, str], uid: str, handle_invisible_table: bool) -> List[LineWithMeta]:
        """
        Convert one node of the parsed tree (a tag or a text node) into document lines.

        The node is dispatched, in this order, as: non-content node (no lines), invisible table
        (one line per non-empty row), text node, unsupported tag (skipped), special symbol tag,
        block tag (children are processed recursively), list tag, any other single tag.

        :param tag: tag or text node to handle
        :param uid: identifier of the parent, used as a prefix/seed for the uids of the produced lines
        :param handle_invisible_table: if True, tables without visible borders are treated as regular tables
        :return: lines extracted from the node; each line without an "html_tag" metadata field gets the node's name
        """
        assert isinstance(tag, (Tag, str))
        if not self.__is_content_tag(tag, handle_invisible_table=handle_invisible_table):
            block_lines = []
        elif tag.name == "table" and not self._visible_table(tag, handle_invisible_table=handle_invisible_table):
            block_lines = self.__handle_invisible_table(block=tag, path_hash=uid)
        elif isinstance(tag, str):
            block_lines = self._handle_text_line(block=tag, path_hash=uid)
        elif tag.name not in HtmlTags.available_tags:
            self.logger.debug(f"skip tag {tag.name.encode()}")
            block_lines = []
        elif tag.name in HtmlTags.special_symbol_tags:
            tag_value = HtmlTags.special_symbol_tags[tag.name]
            block_lines = self._handle_text_line(block=tag_value, path_hash=uid, ignore_space=False)
        elif tag.name in HtmlTags.block_tags:
            # the flag has to be propagated, otherwise tables nested in block tags are always handled
            # with the default (False) and may end up both among the lines and among the tables of `read`
            block_lines = self.__read_blocks(block=tag, path_hash=uid, handle_invisible_table=handle_invisible_table)
        elif tag.name in HtmlTags.list_tags:
            tag_uid = hashlib.md5((uid + str(tag.name)).encode()).hexdigest()
            block_lines = self.__read_list(lst=tag, uid=tag_uid, path_hash=uid, handle_invisible_table=handle_invisible_table)
        else:
            block_lines = self.__handle_single_tag(tag, uid)
        for line in block_lines:
            # do not overwrite a tag name set by a more specific handler
            if not getattr(line.metadata, "html_tag", None):
                line.metadata.extend_other_fields({"html_tag": tag.name})
        return block_lines

    def __handle_single_tag(self, tag: Tag, uid: str) -> List[LineWithMeta]:
        """
        Build a single line from the text of a tag.

        :param tag: tag whose text becomes the line
        :param uid: identifier of the parent, used for the line uid
        :return: list with one line, or an empty list if the tag text is empty or whitespace only
        """
        content = self.__get_text(tag)
        if not content.strip():
            return []

        tag_annotations = self.tag_annotation_parser.parse(tag=tag)

        # header tags carry their level in the name after the first character
        if tag.name in HtmlTags.header_tags:
            level = int(tag.name[1:])
        else:
            level = 0
        kind = HierarchyLevel.header if level != 0 else HierarchyLevel.unknown

        line_uid = hashlib.md5((uid + content).encode()).hexdigest()
        result = self.__make_line(line=content, line_type=kind, header_level=level, uid=line_uid, path_hash=uid, annotations=tag_annotations)
        result.metadata.extend_other_fields({"html_tag": tag.name})
        return [result]

    def __read_blocks(self,
                      block: Tag,
                      path_hash: str = "",
                      handle_invisible_table: bool = False) -> List[LineWithMeta]:
        """
        Extract lines from every direct child of the given block.

        :param block: tag whose children are processed
        :param path_hash: identifier prefix; combined with the block name to seed the children's uids
        :param handle_invisible_table: if True, tables without visible borders are treated as regular tables
        :return: lines of all children in document order, empty list if the block is not a content tag
        """
        block_uid = hashlib.md5((path_hash + str(block.name)).encode()).hexdigest()
        if not self.__is_content_tag(block, handle_invisible_table=handle_invisible_table):
            return []

        collected = []
        for child in block:
            assert isinstance(child, (Tag, str))
            collected += self.__handle_block(tag=child, uid=block_uid, handle_invisible_table=handle_invisible_table)
        return collected

    def _handle_text_line(self, block: str, path_hash: str, ignore_space: bool = True) -> List[LineWithMeta]:
        if not block.strip() and ignore_space:
            return []
        uid = hashlib.md5(block.encode()).hexdigest()
        line = self.__make_line(block, HierarchyLevel.unknown, 0, uid=uid, path_hash=path_hash)
        return [line]

    def __make_line(self, line: str,
                    line_type: str,
                    header_level: int = 0,
                    uid: str = None,
                    path_hash: str = None,
                    annotations: List = None) -> LineWithMeta:
        """
        Create a line with metadata.

        :param line: text of the line
        :param line_type: type of the line, used only when header_level is not 0
        :param header_level: header level; 0 means the line gets no tag hierarchy level from this method
        :param uid: line-specific part of the uid
        :param path_hash: prefix of the uid, the resulting uid is "<path_hash>_<uid>"
        :param annotations: annotations of the line, an empty list is used when None
        :return: the created line
        """
        if header_level == 0:
            tag_level = None
        else:
            tag_level = HierarchyLevel(1, header_level, False, line_type=line_type)
        line_metadata = LineMetadata(page_id=0, line_id=None, tag_hierarchy_level=tag_level)  # TODO line_id

        return LineWithMeta(line=line,
                            metadata=line_metadata,
                            annotations=[] if annotations is None else annotations,
                            uid=f"{path_hash}_{uid}")

    def __get_li_header(self, list_type: str, index: int) -> LineWithMeta:
        """
        Build the marker line of a list item.

        :param list_type: "" gives an empty marker, "a"/"A" give letter markers ("a) ", ..., "z) ", "aa) ", ...),
            any other value gives numeric markers ("1. ", "2. ", ...)
        :param index: zero-based position of the item in its list
        :return: line with the marker text and list_item hierarchy level
        """
        if list_type == "":
            marker = ""
        elif list_type in ("a", "A"):
            letters = string.ascii_lowercase if list_type == "a" else string.ascii_uppercase
            base = len(letters)
            # collect letters from the least significant one, then reverse
            symbols = [letters[index % base]]
            rest = index
            while rest >= base:
                rest = rest // base - 1
                symbols.append(letters[rest % base])
            marker = "".join(reversed(symbols)) + ") "
        else:
            marker = f"{index + 1}. "

        item_level = HierarchyLevel(2, 1, False, line_type=HierarchyLevel.list_item)
        item_metadata = LineMetadata(tag_hierarchy_level=item_level, page_id=0, line_id=0)
        return LineWithMeta(line=marker, metadata=item_metadata, annotations=[])

    def __read_list(self, lst: Tag, uid: str, path_hash: str, handle_invisible_table: bool) -> List[LineWithMeta]:
        """
        Extract lines from a list tag, one or more lines per list item.

        :param lst: list tag
        :param uid: identifier of the list (not used by this method)
        :param path_hash: identifier prefix passed to the items
        :param handle_invisible_table: if True, tables without visible borders are treated as regular tables
        :return: lines of all list items in document order; children that are not list items are ignored
        """
        # The `type` attribute selects the numbering only for ordered lists. On unordered lists its legacy
        # values (disc, circle, square) describe the bullet shape and must not turn the items into numbered ones.
        if lst.name in HtmlTags.ordered_list:
            list_type = lst.get("type", "1")
        else:
            list_type = ""

        lines = []
        list_items = (item for item in lst if item.name in HtmlTags.list_items)
        for item_index, item in enumerate(list_items):
            lines.extend(self.__handle_list_item(item=item,
                                                 item_index=item_index,
                                                 list_type=list_type,
                                                 path_hash=path_hash,
                                                 handle_invisible_table=handle_invisible_table))
        return lines

    def __handle_list_item(self,
                           item: Tag,
                           item_index: int,
                           list_type: str,
                           path_hash: str,
                           handle_invisible_table: bool) -> List[LineWithMeta]:
        """
        Extract lines from one list item.

        The item starts with its marker line; content lines of unknown hierarchy level are appended
        to the current line, any other line (e.g. an item of a nested list) starts a new one.

        :param item: list item tag
        :param item_index: zero-based position of the item in its list
        :param list_type: numbering type of the list, see `__get_li_header`
        :param path_hash: identifier prefix for the produced lines
        :param handle_invisible_table: if True, tables without visible borders are treated as regular tables
        :return: lines of the item
        """
        current = self.__get_li_header(list_type=list_type, index=item_index)
        content_lines = self.__handle_block(item, uid=path_hash, handle_invisible_table=handle_invisible_table)
        base_depth = current.metadata.tag_hierarchy_level.level_1

        finished = []
        for content_line in content_lines:
            content_level = content_line.metadata.tag_hierarchy_level
            if not content_level.is_unknown():
                # Handle complex and nested lists: close the current line and shift the nested one deeper
                finished.append(current)
                content_line.metadata.tag_hierarchy_level.level_1 += base_depth
                current = content_line
            else:
                current += content_line
        finished.append(current)
        return finished

    # returns the text content of a tag; used by __handle_single_tag
    def __get_text(self, tag: Tag) -> [str, int, int]:
        text = tag.getText() + "\n" if tag.name == "p" else tag.getText()
        text = "" if text is None else text
        return text

    def __is_content_tag(self, tag: Tag, handle_invisible_table: bool = False) -> bool:
        """
        Check if the given node carries document content.

        :param tag: html tag or text node
        :param handle_invisible_table: if True, tables without visible borders are treated as regular tables
        :return: False for service tags, doctype declarations and comments, True otherwise
        """
        if tag.name in HtmlTags.service_tags:
            return False

        is_hidden_table = tag.name == "table" and not self._visible_table(tag, handle_invisible_table=handle_invisible_table)
        if is_hidden_table:
            return True

        return not isinstance(tag, (Doctype, Comment))

    def __handle_invisible_table(self, block: Tag, path_hash: str) -> List[LineWithMeta]:
        """
        Turn a table without visible borders into plain lines.

        :param block: table tag
        :param path_hash: identifier prefix for the produced lines
        :return: one line of unknown type per row, cells joined with a space; blank rows are skipped
        """
        table_uid = hashlib.md5(block.name.encode()).hexdigest()
        row_lines = []
        for cells in self._read_table(block).cells:
            row_text = " ".join(cells)
            if not row_text.strip():
                continue
            row_uid = hashlib.md5((table_uid + row_text).encode()).hexdigest()
            row_lines.append(self.__make_line(line=row_text, line_type=HierarchyLevel.unknown, uid=row_uid, path_hash=path_hash))
        return row_lines

    def _read_table(self, table: Tag) -> Table:
        """
        Read the cell texts of a table.

        :param table: table tag
        :return: table whose cells are the texts of the cell tags found in each row tag, with a newly generated uid
        """
        cell_texts = [
            [cell.getText() for cell in row.find_all(HtmlTags.table_cells)]
            for row in table.find_all(HtmlTags.table_rows)
        ]
        table_metadata = TableMetadata(page_id=0, uid=str(uuid.uuid1()))
        return Table(cells=cell_texts, metadata=table_metadata)

    def _visible_table(self, table: Tag, handle_invisible_table: bool) -> bool:
        if handle_invisible_table:
            return True
        assert table.name == "table", f"block {table} is not table"
        for td in table.find_all("td"):
            style = td.attrs.get("style", "")
            if "border-bottom-style:solid" in style or "border-top-style:solid" in style:
                return True
        return table.attrs.get("border", "0") != "0"