# Source code for dedoc.readers.pdf_reader.utils.header_footers_analysis

import re
from collections import Counter
from typing import List, Optional, Tuple

import numpy as np

from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
from dedoc.utils.utils import similarity


class HeaderFooterDetector:
    """
    Class detects header and footer textual lines.
    The algorithm was implemented according to the article:

    `Lin X. Header and footer extraction by page association //Document Recognition and Retrieval X. – SPIE, 2003. – Vol. 5010. – P. 164-171.`

    Algorithm's notes:

        1. For documents of 6 pages or more, each page is compared with the page two positions ahead
           (odd pages with odd pages, even pages with even pages) to detect alternating footers-headers.
           For documents of less than 6 pages, adjacent pages are compared.
           Therefore, alternating footers-headers will not be detected on documents of less than 6 pages.

        2. The algorithm analyzes the first 4 and last 4 lines on each page of the document and,
           by comparing lines across pages, identifies common footer-header patterns using Levenshtein similarity.

        3. For algorithm work, the document must have at least two pages of text.
           It is not an ML algorithm so it cannot work with just one page.

        4. The more pages, the better. Remember that the parameter `pages` limits the number of pages in a document.
    """

    def __init__(self) -> None:
        # 1 - per-position weights of the similarity score: the first 4 values are used for the top 4 lines of a page (header),
        # the last 4 values for the bottom 4 lines (footer); lines closer to the page body get a smaller weight
        self.weights = [1.0, 1.0, 0.85, 0.75, 0.75, 0.85, 1.0, 1.0]
        self.max_cnt_lines = len(self.weights)
        # roman numerals and digit sequences are replaced by a placeholder before the comparison (see `_replace_roman_and_digits_strict`)
        self.pattern_roman = r"\b[IVXLCDM]+\.?\b|\b[ivxlcdm]+\.?\b"
        self.pattern_digits = r"\d+"

    def detect(self, lines: List[List[LineWithLocation]], is_header_footer_threshold: float = 0.5) \
            -> Tuple[List[List[LineWithLocation]], List[List[LineWithLocation]], List[List[LineWithLocation]]]:
        """
        Split the lines of each page into content lines, header lines and footer lines.

        Pages that contain fewer than `max_cnt_lines` lines (after stripping empty lines) do not take part in the score calculation,
        but detected header-footer patterns are still removed from them.
        The given list is not modified.

        :param lines: list of pages, each page is a list of its lines
        :param is_header_footer_threshold: a line position is treated as a header-footer position if its mean weighted similarity \
            between the compared pages is greater than this value
        :return: tuple of three lists with one item per page: lines without headers and footers, header lines, footer lines
        """
        half_cnt_lines = self.max_cnt_lines // 2
        scores = np.zeros(shape=(self.max_cnt_lines,), dtype=float)
        patterns = [[] for _ in range(self.max_cnt_lines)]
        cnt_cmpr = 0

        lines = self._strip_empty_lines(lines)
        page_cnt = len(lines)
        step = 2 if page_cnt > 5 else 1  # between one page for a big document (with changed header-footers)

        # 2 - formed comparison pattern for similarity (None for the pages which are too short to be compared)
        for page_lines in lines:
            is_long_enough = len(page_lines) >= self.max_cnt_lines
            for line_index in range(half_cnt_lines):
                for position in (line_index, -line_index - 1):  # header position, then the mirrored footer position
                    patterns[position].append(self._replace_roman_and_digits_strict(page_lines[position].line) if is_long_enough else None)

        # 3 - calculate score of each header-footer line
        for page_one in range(page_cnt - step):
            page_two = page_one + step
            if len(lines[page_one]) < self.max_cnt_lines or len(lines[page_two]) < self.max_cnt_lines:
                continue
            for line_index in range(half_cnt_lines):
                for position in (line_index, -line_index - 1):
                    line_similarity = self._similarity(s1=patterns[position][page_one], s2=patterns[position][page_two])
                    scores[position] += self.weights[position] * line_similarity
            cnt_cmpr += 1

        if cnt_cmpr == 0:
            # no pair of pages was compared (one page or too short pages): there is no evidence for any header-footer,
            # and division by zero must be avoided
            is_footer_header = np.zeros(shape=(self.max_cnt_lines,), dtype=bool)
        else:
            is_footer_header = scores / cnt_cmpr > is_header_footer_threshold

        # 4 - get the popular pattern from lines with high scores
        popular_patterns = self._get_popular_pattern(is_footer_header=is_footer_header, threshold=0.4 if step == 2 else 0.7, patterns=patterns)

        # 5 - delete only those lines which match with popular patterns
        headers, footers = [], []
        for page_id in range(page_cnt):
            page_headers, page_footers = [], []

            for line_id in range(half_cnt_lines):
                header = self._remove_header_footer(is_footer_header, popular_patterns, lines, page_id, line_id)
                if header:
                    lines[page_id][line_id] = None  # mark as removed, the indexes of the other lines must stay the same
                    page_headers.append(header)

                footer = self._remove_header_footer(is_footer_header, popular_patterns, lines, page_id, -line_id - 1)
                if footer:
                    lines[page_id][-line_id - 1] = None
                    page_footers.append(footer)

            headers.append(page_headers)
            footers.append(page_footers)

        # remove None-elements
        lines = [[line for line in page if line] for page in lines]

        return lines, headers, footers

    def _replace_roman_and_digits_strict(self, text: str) -> str:
        """
        Replace each roman numeral and each sequence of digits by "@", merge consecutive "@" into one and strip the result.

        :param text: text of the line
        :return: normalized text used for the comparison of lines
        """
        result = re.sub(self.pattern_roman, "@", text)
        result = re.sub(self.pattern_digits, "@", result)
        result = re.sub(r"@+", "@", result)

        return result.strip()

    def _similarity(self, s1: str, s2: str) -> float:
        """
        Compare two normalized lines.

        :param s1: first normalized line
        :param s2: second normalized line
        :return: 0.0 if at least one of the lines is empty, otherwise the result of `similarity` for the given lines
        """
        if len(s1) == 0 or len(s2) == 0:
            return 0.0
        return similarity(s1, s2)

    def _strip_empty_lines(self, lines: List[List[LineWithLocation]]) -> List[List[LineWithLocation]]:
        """
        Remove leading and trailing empty lines (only whitespaces with a line break at the end) from each page.
        Empty lines between non-empty lines are kept.

        :param lines: list of pages, each page is a list of its lines
        :return: new list of pages, the given list and its pages are not modified
        """
        reg_empty_string = re.compile(r"^\s*\n$")
        stripped_pages = []
        for page_lines in lines:
            begin_content = 0
            end_content = len(page_lines)  # exclusive
            while begin_content < end_content and reg_empty_string.match(page_lines[begin_content].line) is not None:
                begin_content += 1
            while end_content > begin_content and reg_empty_string.match(page_lines[end_content - 1].line) is not None:
                end_content -= 1
            stripped_pages.append(page_lines[begin_content:end_content])

        return stripped_pages

    def _remove_header_footer(self, is_footer_header: np.ndarray,
                              popular_patterns: List[List[str]],
                              lines: List[List[LineWithLocation]],
                              page_id: int, line_id: int) -> Optional[LineWithLocation]:
        """
        Check if the line on the given position is a header-footer line.

        :param is_footer_header: flags of header-footer positions
        :param popular_patterns: popular patterns for each position (see `_get_popular_pattern`)
        :param lines: list of pages, lines which were already removed are replaced by None
        :param page_id: index of the page
        :param line_id: position of the line on the page (negative positions are counted from the end of the page)
        :return: the line if its normalized text starts with one of the popular patterns of its position, otherwise None
        """
        if not is_footer_header[line_id] or abs(line_id) >= len(lines[page_id]):
            return None

        line = lines[page_id][line_id]
        if line is None:
            # on a short page header and footer positions overlap, so the line could be already removed
            return None

        normalized_text = self._replace_roman_and_digits_strict(line.line)
        for pattern in popular_patterns[line_id]:
            # patterns are pieces of the document text, not regular expressions, so they are compared literally
            if normalized_text.startswith(pattern):
                return line

        return None

    def _get_popular_pattern(self, is_footer_header: np.ndarray, threshold: float, patterns: List[List[Optional[str]]]) -> List[List[str]]:
        """
        Get frequently occurring patterns for each header-footer position.

        :param is_footer_header: flags of header-footer positions
        :param threshold: a pattern is popular if its share among non-empty patterns of the position is greater than this value
        :param patterns: normalized lines for each position and each page (None for the pages which were too short)
        :return: list of popular patterns for each position (empty list for positions which are not header-footer positions)
        """
        # Algorithm if header takes more than 40% of changed header-footer of doc
        #                       and more 70% in the doc with const header-footers
        #                        is_footer_header = [True,              False, False, False, True,            True         ]
        # Result example: popular_patterns_of_hf = [["header of company"],[], [], [], ["- @ -"], ["- @ -", "Robert's team"]]
        #                                          [------------ headers -------],[----------------footers-----------------]

        popular_patterns = [[] for _ in range(self.max_cnt_lines)]

        for position, position_patterns in enumerate(patterns):
            if not is_footer_header[position]:
                continue
            non_empty_patterns = [pattern for pattern in position_patterns if pattern]
            if not non_empty_patterns:
                continue
            total = len(non_empty_patterns)
            popular_patterns[position].extend(pattern for pattern, count in Counter(non_empty_patterns).items() if count / total > threshold)

        return popular_patterns