Source code for dedoc.structure_extractors.patterns.regexp_pattern

import re
from typing import Optional, Union

from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.structure_extractors.patterns.abstract_pattern import AbstractPattern


class RegexpPattern(AbstractPattern):
    r"""
    Pattern for matching line text by a regular expression.

    .. note::

        Before regular expression matching, the line text is stripped (space symbols are deleted from both sides)
        and converted to lowercase, so lower and upper letters of the **line** are not differed.
        The regular expression itself is used as given: write its literal letters in lowercase
        (or pass a pattern compiled with ``re.IGNORECASE``), otherwise uppercase literals will never match.
        The expression is applied with ``re.Pattern.match``, i.e. it must match from the beginning of the text.

    .. seealso::

        Syntax for writing regular expressions is described in the `Python documentation <https://docs.python.org/3/library/re.html>`_.

    Example of library usage:

    .. code-block:: python

        import re
        from dedoc.structure_extractors import DefaultStructureExtractor
        from dedoc.structure_extractors.patterns import RegexpPattern

        reader = ...
        structure_extractor = DefaultStructureExtractor()
        patterns = [
            RegexpPattern(regexp=r"^chapter\s\d+\.", line_type="chapter", level_1=1, can_be_multiline=False),
            RegexpPattern(regexp=re.compile(r"^part\s\d+\.\d+\."), line_type="part", level_1=2, can_be_multiline=False)
        ]
        document = reader.read(file_path=file_path)
        document = structure_extractor.extract(document=document, parameters={"patterns": patterns})

    Example of API usage:

    .. code-block:: python

        import requests

        patterns = [{"name": "regexp", "regexp": r"^chapter\s\d+\.", "line_type": "chapter", "level_1": 1, "can_be_multiline": "false"}]
        parameters = {"patterns": str(patterns)}
        with open(file_path, "rb") as file:
            files = {"file": (file_name, file)}
            r = requests.post("http://localhost:1231/upload", files=files, data=parameters)
    """ # noqa
    _name = "regexp"

    def __init__(self,
                 regexp: Union[str, re.Pattern],
                 line_type: str,
                 level_1: Optional[int] = None,
                 level_2: Optional[int] = None,
                 can_be_multiline: Optional[Union[bool, str]] = None) -> None:
        """
        Initialize pattern with default values of :class:`~dedoc.data_structures.HierarchyLevel` attributes.

        :param regexp: regular expression for checking, if the line text matches the pattern.
            A string is compiled with ``re.compile`` (without additional flags), an already compiled pattern is stored as is.
            Note that regular expression is used on the lowercase and stripped line.
        :param line_type: type of the line, e.g. "header", "bullet_list_item", "chapter", etc.
        :param level_1: value of a line primary importance
        :param level_2: level of the line inside specific class
        :param can_be_multiline: is used to unify lines inside tree node by :class:`~dedoc.structure_constructors.TreeConstructor`,
            if line can be multiline, it can be joined with another line. If ``None`` is given, can_be_multiline is set to ``True``.
        """
        super().__init__(line_type=line_type, level_1=level_1, level_2=level_2, can_be_multiline=can_be_multiline)
        # Compile only plain strings; anything else (expected: re.Pattern) is kept untouched, including its flags.
        self._regexp = re.compile(regexp) if isinstance(regexp, str) else regexp

    def match(self, line: LineWithMeta) -> bool:
        """
        Check if the pattern is suitable for the given line.
        Line text is checked by applying pattern's regular expression, text is stripped and made lowercase beforehand.

        :param line: line whose text (``line.line``) is checked
        :return: ``True`` if the regular expression matches at the beginning of the prepared text, ``False`` otherwise
        """
        text = line.line.strip().lower()
        # `match` (not `search`): the expression has to match starting from the first character of the text.
        return self._regexp.match(text) is not None

    def get_hierarchy_level(self, line: LineWithMeta) -> HierarchyLevel:
        """
        This method should be applied only when :meth:`~dedoc.structure_extractors.patterns.RegexpPattern.match`
        returned ``True`` for the given line.

        Return :class:`~dedoc.data_structures.HierarchyLevel` for initialising ``line.metadata.hierarchy_level``.
        The attributes ``line_type``, ``level_1``, ``level_2``, ``can_be_multiline`` are equal to values given during class initialisation.

        :param line: matched line (its content is not used to compute the result)
        """
        return HierarchyLevel(line_type=self._line_type, level_1=self._level_1, level_2=self._level_2, can_be_multiline=self._can_be_multiline)