Source code for dedoc.structure_extractors.abstract_structure_extractor

from abc import ABC, abstractmethod
from typing import List, Optional

from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.unstructured_document import UnstructuredDocument


[docs]class AbstractStructureExtractor(ABC):
    """
    This class adds additional information to the given unstructured document (list of lines) received from some of the readers.
    Types of lines (paragraph_type) and their levels (hierarchy_level) in the document are added.

    The hierarchy level of the line shows the importance of the line in the document: the more important the line is, the less level value it has.
    Look at the class :class:`dedoc.data_structures.HierarchyLevel` for more information.

    The paragraph type of the line should be one of the predefined types for some certain document domain, e.g. header, list_item, raw_text, etc.
    Each concrete structure extractor defines the rules of structuring: the levels and possible types of the lines.
    """
[docs]    def __init__(self, *, config: Optional[dict] = None) -> None:
        """
        :param config: configuration of the extractor, e.g. logger for logging
        """
        import logging

        self.config = {} if config is None else config
        self.logger = self.config.get("logger", logging.getLogger())

[docs]    @abstractmethod
    def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument:
        """
        This method extracts structure for the document content received from some reader:
        it finds lines types and their hierarchy levels and adds them to the lines' metadata.

        :param document: document content that has been received from some of the readers
        :param parameters: additional parameters for document parsing, see :ref:`structure_type_parameters` for more details
        :return: document content with added additional information about lines types and hierarchy levels
        """
        pass

    def _postprocess(self, lines: List[LineWithMeta], paragraph_type: List[str], regexps: List, excluding_regexps: List) -> List[LineWithMeta]:
        """
        The function searches for which of regular expressions (for regexps parameters) the string matches.
        If there is match, then additional node is creating.
        To filter out garbage (extra letters, spaces, etc.),
        the excluding_regexps is applied after for the matched substring (for example: "1.П"->"1.", "4.7.\t"->"4.7.")

        :param lines: input lines
        :param paragraph_type: list of paragraph types
        :param regexps: list of regular pattern according to the list of paragraph types
        :param excluding_regexps: list of filtering garbage regular pattern according to list of paragraph types
        :return: new post-processed list of LineWithMeta
        """
        from copy import deepcopy
        from dedoc.data_structures.hierarchy_level import HierarchyLevel

        if self.config.get("labeling_mode", False):
            return lines

        result = []
        for line in lines:
            if line.metadata.hierarchy_level.is_raw_text() and len(line.line) == 0:  # skip empty raw text
                continue
            if line.metadata.hierarchy_level.line_type in paragraph_type:
                matched = False
                for num, regexp in enumerate(regexps):
                    match = regexp.match(line.line)

                    if match:
                        matched = True
                        start = match.start()
                        end = match.end()
                        if excluding_regexps[num]:
                            match_excluding = excluding_regexps[num].search(line.line[start:end])
                            end = match_excluding.start() if match_excluding else end

                        result.append(LineWithMeta(line=line.line[start:end],
                                                   metadata=line.metadata,
                                                   annotations=self._select_annotations(line.annotations, start, end),
                                                   uid=line.uid))
                        metadata = deepcopy(line.metadata)
                        metadata.hierarchy_level = HierarchyLevel.create_raw_text()

                        rest_text = line.line[end:]
                        if len(rest_text) > 0:
                            annotations = self._select_annotations(line.annotations, end, len(line.line))
                            result.append(LineWithMeta(line=rest_text, metadata=metadata, annotations=annotations, uid=line.uid + "_split"))
                        break
                if not matched:
                    result.append(line)
            else:
                result.append(line)

        return result

    @staticmethod
    def _select_annotations(annotations: List[Annotation], start: int, end: int) -> List[Annotation]:
        from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
        from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation

        assert start <= end
        res = []
        for annotation in annotations:
            if annotation.name in [TableAnnotation.name, AttachAnnotation.name]:
                if start == 0:
                    new_annotation = Annotation(start=start, end=end, value=annotation.value, name=annotation.name)
                    res.append(new_annotation)
            elif annotation.end > start and annotation.start <= end:
                new_start = max(annotation.start, start) - start
                new_end = min(annotation.end, end) - start
                new_annotation = Annotation(start=new_start, end=new_end, value=annotation.value, name=annotation.name)
                res.append(new_annotation)
        return res