Source code for dedoc.data_structures.tree_node

from typing import List, Optional

from dedoc.api.schema.tree_node import TreeNode as ApiTreeNode
from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_metadata import LineMetadata
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.serializable import Serializable


[docs]class TreeNode(Serializable): """ TreeNode helps to represent document as recursive tree structure. It has parent node (None for root ot the tree) and list of children nodes (empty list for list node). :ivar node_id: unique node identifier :ivar text: text of the node (may contain several lines) :ivar annotations: some metadata related to the part of the text (as font size) :ivar metadata: metadata refers to entire node (as node type) :ivar subparagraphs: list of child of this node :ivar parent: parent node (None for root, not none for other nodes) :vartype node_id: str :vartype text: str :vartype annotations: List[Annotation] :vartype metadata: LineMetadata :vartype subparagraphs: List[TreeNode] :vartype parent: TreeNode """ def __init__(self, node_id: str, text: str, annotations: List[Annotation], metadata: LineMetadata, subparagraphs: List["TreeNode"], parent: Optional["TreeNode"]) -> None: """ :param node_id: node id is unique in one document :param text: text of the node :param annotations: metadata related to the part of the text :param metadata: metadata refers to entire node :param subparagraphs: list of child of this node :param parent: parent node """ self.node_id: str = node_id self.text: str = text self.annotations: List[Annotation] = annotations self.metadata: LineMetadata = metadata self.subparagraphs: List["TreeNode"] = subparagraphs self.parent: "TreeNode" = parent
[docs] def to_api_schema(self) -> ApiTreeNode: annotations = [annotation.to_api_schema() for annotation in self.annotations] metadata = self.metadata.to_api_schema() subparagraphs = [node.to_api_schema() for node in self.subparagraphs] return ApiTreeNode(node_id=self.node_id, text=self.text, annotations=annotations, metadata=metadata, subparagraphs=subparagraphs)
[docs] @staticmethod def create(lines: List[LineWithMeta] = None) -> "TreeNode": """ Creates a root node with given text. :param lines: this lines should be the title of the document (or should be empty for documents without title) :return: root of the document tree """ page_id = 0 if len(lines) == 0 else min((line.metadata.page_id for line in lines)) line_id = 0 if len(lines) == 0 else min((line.metadata.line_id for line in lines)) metadata = LineMetadata(page_id=page_id, line_id=line_id, hierarchy_level=HierarchyLevel.create_root()) texts = (line.line for line in lines) annotations = [] text_length = 0 for line in lines: annotations.extend(TreeNode.__shift_annotations(line=line, text_length=text_length)) TreeNode.__add_additional_page_id(start=text_length, metadata=metadata, other_line=line) text_length += len(line.line) text = "".join(texts) return TreeNode("0", text, annotations=annotations, metadata=metadata, subparagraphs=[], parent=None)
[docs] def add_child(self, line: LineWithMeta) -> "TreeNode": """ Create a new tree node - children of the given node from given line. Return newly created node. :param line: Line with meta, new node will be built from this line :return: return created node (child of the self) """ new_node = TreeNode( node_id=f"{self.node_id}.{len(self.subparagraphs)}", text=line.line, annotations=line.annotations, metadata=line.metadata, subparagraphs=[], parent=self ) self.subparagraphs.append(new_node) return new_node
[docs] def add_text(self, line: LineWithMeta) -> None: """ Add the text and annotations from given line, text is separated with aa len line symbol. :param line: line with text to add """ text_length = len(self.text) new_annotations = self.__shift_annotations(line, text_length) self.__add_additional_page_id(start=len(self.text), metadata=self.metadata, other_line=line) self.text += line.line self.annotations.extend(new_annotations)
@staticmethod def __shift_annotations(line: LineWithMeta, text_length: int) -> List[Annotation]: new_annotations = [] for annotation in line.annotations: new_annotation = Annotation(start=annotation.start + text_length, end=annotation.end + text_length, name=annotation.name, value=annotation.value) new_annotations.append(new_annotation) return new_annotations
[docs] def get_root(self) -> "TreeNode": """ :return: root of the tree """ node = self while node.parent is not None: node = node.parent return node
def merge_annotations(self) -> None: from dedoc.utils.annotation_merger import AnnotationMerger root = self.get_root() stack = [root] merger = AnnotationMerger() while len(stack) > 0: node = stack.pop() node.annotations = merger.merge_annotations(node.annotations, node.text) for sub_node in node.subparagraphs: stack.append(sub_node) @staticmethod def __add_additional_page_id(start: int, metadata: LineMetadata, other_line: LineWithMeta) -> None: """ Adds additional page_id metadata for multi-page nodes. If node is located on several pages, its metadata will contain "additional_page_id" attribute with list of dicts: { start: start index of the text on the next page, end: end index (not included), page_id: page id, where this textual part (node_text[start:end]) is located } """ if metadata.page_id == other_line.metadata.page_id: return if hasattr(metadata, "additional_page_ids"): last_page_id = metadata.additional_page_ids[-1]["page_id"] if last_page_id == other_line.metadata.page_id: metadata.additional_page_ids[-1]["end"] = start + len(other_line.line) return additional_page_id = {"start": start, "end": start + len(other_line.line), "page_id": other_line.metadata.page_id} if hasattr(metadata, "additional_page_ids"): metadata.additional_page_ids.append(additional_page_id) else: metadata.additional_page_ids = [additional_page_id]