# Source code for dedoc.readers.pdf_reader.utils.header_footers_analysis

import re
from collections import Counter
from typing import List, Optional, Tuple

import numpy as np

from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
from dedoc.utils.utils import similarity


class HeaderFooterDetector:
    """
    Class detects header and footer textual lines.
    The algorithm was implemented according to the article:
    `Lin X. Header and footer extraction by page association // Document Recognition and Retrieval X. - SPIE, 2003. - Vol. 5010. - pp. 164-171.`

    Algorithm's notes:

    1. For documents of 6 pages or more, lines on even and odd pages of the document are compared to detect
       alternating footers-headers. For documents of less than 6 pages, lines between adjacent pages are compared.
       Therefore, alternating footers-headers will not be detected on documents of less than 6 pages.
    2. The algorithm analyzes the first 4 and last 4 lines on each page of the document and, by comparing lines
       across pages, identifies common footer-header patterns using Levenshtein similarity.
    3. For algorithm work, the document must have at least two pages of text.
       It is not an ML algorithm so it cannot work with just one page.
    4. The more pages, the better. Remember that the parameter `pages` limits the number of pages in a document.
    """

    def __init__(self) -> None:
        # 1 - first 4 weights are for header positions, last 4 weights are for footer positions
        self.weights = [1.0, 1.0, 0.85, 0.75, 0.75, 0.85, 1.0, 1.0]
        self.max_cnt_lines = len(self.weights)
        # roman numerals and digit runs are treated as page-number-like tokens
        self.pattern_roman = r"\b[IVXLCDM]+\.?\b|\b[ivxlcdm]+\.?\b"
        self.pattern_digits = r"\d+"

    def detect(self, lines: List[List["LineWithLocation"]], is_header_footer_threshold: float = 0.5) \
            -> Tuple[List[List["LineWithLocation"]], List[List["LineWithLocation"]], List[List["LineWithLocation"]]]:
        """
        Split the lines of each page into body lines, header lines and footer lines.

        :param lines: lines of the document grouped by pages (one inner list per page)
        :param is_header_footer_threshold: a line position is considered a header/footer position if its
            average weighted similarity across compared pages is strictly greater than this value
        :return: tuple (lines without detected headers and footers, headers per page, footers per page),
            each element has one inner list per page
        """
        half = self.max_cnt_lines // 2
        scores = np.zeros(shape=(self.max_cnt_lines,), dtype=float)
        patterns = [[] for _ in range(self.max_cnt_lines)]
        cnt_cmpr = 0

        lines = self._strip_empty_lines(lines)
        page_cnt = len(lines)
        # compare every second page for a big document (header-footers may alternate between even and odd pages)
        step = 2 if page_cnt > 5 else 1

        # 2 - form comparison patterns; pages that are too short get None placeholders
        for page_lines in lines:
            has_enough_lines = len(page_lines) >= self.max_cnt_lines
            for line_index in range(half):
                if has_enough_lines:
                    patterns[line_index].append(self._replace_roman_and_digits_strict(page_lines[line_index].line))
                    patterns[-line_index - 1].append(self._replace_roman_and_digits_strict(page_lines[-line_index - 1].line))
                else:
                    patterns[line_index].append(None)
                    patterns[-line_index - 1].append(None)

        # 3 - calculate the score of each header-footer line position
        for page_one in range(page_cnt - step):
            page_two = page_one + step
            if len(lines[page_one]) < self.max_cnt_lines or len(lines[page_two]) < self.max_cnt_lines:
                continue

            for line_index in range(half):
                header_similarity = self._similarity(s1=patterns[line_index][page_one], s2=patterns[line_index][page_two])
                scores[line_index] += self.weights[line_index] * header_similarity

                footer_similarity = self._similarity(s1=patterns[-line_index - 1][page_one], s2=patterns[-line_index - 1][page_two])
                scores[-line_index - 1] += self.weights[-line_index - 1] * footer_similarity
            cnt_cmpr += 1

        if cnt_cmpr > 0:
            scores /= cnt_cmpr
            is_footer_header = scores > is_header_footer_threshold
        else:
            # no pair of pages could be compared (e.g. one page or only short pages):
            # avoid 0/0 and report that no position is a header or a footer
            is_footer_header = np.zeros(shape=(self.max_cnt_lines,), dtype=bool)

        # 4 - get the popular patterns from line positions with high scores
        popular_patterns = self._get_popular_pattern(is_footer_header=is_footer_header, threshold=0.4 if step == 2 else 0.7, patterns=patterns)

        # 5 - delete only those lines which match the popular patterns
        headers, footers = [], []
        for page_id in range(page_cnt):
            page_headers, page_footers = [], []
            for line_id in range(half):
                header = self._remove_header_footer(is_footer_header, popular_patterns, lines, page_id, line_id)
                if header:
                    lines[page_id][line_id] = None
                    page_headers.append(header)

                footer = self._remove_header_footer(is_footer_header, popular_patterns, lines, page_id, -line_id - 1)
                if footer:
                    lines[page_id][-line_id - 1] = None
                    page_footers.append(footer)
            headers.append(page_headers)
            footers.append(page_footers)

        # remove the None placeholders left by step 5
        # NOTE(review): the filter is truthiness-based, so a line object that evaluates to False is dropped too
        lines = [[line for line in page if line] for page in lines]
        return lines, headers, footers

    def _replace_roman_and_digits_strict(self, text: str) -> str:
        """
        Replace roman numerals and digit runs with "@", merge consecutive "@" and strip surrounding whitespace.
        """
        result = re.sub(self.pattern_roman, "@", text)
        result = re.sub(self.pattern_digits, "@", result)
        result = re.sub(r"@+", "@", result)
        return result.strip()

    def _similarity(self, s1: str, s2: str) -> float:
        """
        Return the similarity of two patterns, or 0.0 if any of them is empty or missing.
        """
        if not s1 or not s2:
            return 0.0
        return similarity(s1, s2)

    def _strip_empty_lines(self, lines: List[List["LineWithLocation"]]) -> List[List["LineWithLocation"]]:
        """
        Drop leading and trailing empty lines (whitespace ending with a line break) from every page.

        NOTE: each element of the given outer list is replaced in place by the stripped page,
        the same outer list is returned.
        """
        reg_empty_string = re.compile(r"^\s*\n$")

        for page_id, page_lines in enumerate(lines):
            begin = 0
            while begin < len(page_lines) and reg_empty_string.match(page_lines[begin].line) is not None:
                begin += 1

            end = len(page_lines) - 1
            while end >= begin and reg_empty_string.match(page_lines[end].line) is not None:
                end -= 1

            lines[page_id] = page_lines[begin:end + 1]
        return lines

    def _remove_header_footer(self,
                              is_footer_header: np.ndarray,
                              popular_patterns: List[List[str]],
                              lines: List[List["LineWithLocation"]],
                              page_id: int,
                              line_id: int) -> Optional["LineWithLocation"]:
        """
        Return the line at position `line_id` of the page `page_id` if it is a header or a footer, None otherwise.

        :param is_footer_header: flags of line positions that were recognized as header/footer positions
        :param popular_patterns: popular patterns for each line position
        :param lines: lines grouped by pages, already removed lines may be replaced by None
        :param page_id: index of the page
        :param line_id: index of the line on the page (negative values count from the end of the page)
        """
        # NOTE(review): for negative line_id this bound also rejects line_id == -len(page), which is a valid index
        if not is_footer_header[line_id] or abs(line_id) >= len(lines[page_id]):
            return None

        line = lines[page_id][line_id]
        if line is None:
            # on short pages header and footer positions overlap, the line was already taken as a header/footer
            return None

        text = self._replace_roman_and_digits_strict(line.line)
        for pattern in popular_patterns[line_id]:
            # patterns are plain line texts: check the literal prefix first,
            # so patterns with regex metacharacters (brackets, "+", "?") still match themselves
            if text.startswith(pattern):
                return line
            try:
                if re.match(pattern, text):
                    return line
            except re.error:
                # the text is not a valid regular expression, the literal check above is the only option
                continue
        return None

    def _get_popular_pattern(self, is_footer_header: np.ndarray, threshold: float, patterns: List[List[str]]) -> List[List[str]]:
        """
        For each header/footer line position, select patterns whose share among non-empty patterns exceeds `threshold`.

        The caller uses the threshold 0.4 for documents with alternating header-footers
        and 0.7 for documents with constant header-footers.

        is_footer_header = [True, False, False, False, True, True]
        Result example: popular_patterns = [["header of company"], [], [], [], ["- @ -"], ["- @ -", "Robert's team"]]
        """
        popular_patterns = [[] for _ in range(self.max_cnt_lines)]

        for position, position_patterns in enumerate(patterns):
            if not is_footer_header[position]:
                continue

            counts = Counter(pattern for pattern in position_patterns if pattern)
            total = sum(counts.values())
            if total == 0:
                # only placeholders or empty texts at this position, nothing can be popular
                continue

            popular_patterns[position].extend(pattern for pattern, count in counts.items() if count / total > threshold)
        return popular_patterns