from typing import Any, List, Optional
from dedoc.common.exceptions.bad_file_error import BadFileFormatError
from dedoc.common.exceptions.bad_parameters_error import BadParametersError
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.base_reader import BaseReader
[docs]class JsonReader(BaseReader):
"""
This reader allows handle .json files.
"""
def __init__(self, *, config: Optional[dict] = None) -> None:
from dedoc.attachments_extractors.concrete_attachments_extractors.json_attachment_extractor import JsonAttachmentsExtractor
from dedoc.extensions import recognized_extensions, recognized_mimes
super().__init__(config=config, recognized_extensions=recognized_extensions.json_like_format, recognized_mimes=recognized_mimes.json_like_format)
self.attachment_extractor = JsonAttachmentsExtractor(config=self.config)
[docs] def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
"""
The method return document content with all document's lines and attachments, tables remain empty.
This reader considers json lists as list items and adds this information to the `tag_hierarchy_level`
of :class:`~dedoc.data_structures.LineMetadata`.
The dictionaries are processed by creating key line with type `key` and value line as a child.
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
"""
from json import JSONDecodeError
import ujson as json
from dedoc.data_structures.hierarchy_level import HierarchyLevel
parameters = {} if parameters is None else parameters
with open(file_path) as file:
try:
json_data = json.load(file)
except (JSONDecodeError, ValueError):
raise BadFileFormatError(msg="Seems that json is invalid")
if "html_fields" in parameters:
fields = parameters.get("html_fields", "[]")
try:
key_fields = json.loads(fields if fields else "[]")
except (JSONDecodeError, ValueError):
raise BadParametersError(f"can't read html_fields {fields}")
json_data = self.__exclude_html_fields(json_data, key_fields)
attachments = self.attachment_extractor.extract(file_path=file_path, parameters=parameters)
else:
attachments = []
stack = [(json_data, 1)]
result = []
while len(stack) > 0:
element, depth = stack.pop()
if isinstance(element, dict) and len(element) > 0:
self.__handle_dict(depth, element, result, stack)
if isinstance(element, list) and len(element) > 0:
self.__handle_list(depth, element, result, stack)
elif self.__is_flat(element):
line = self.__handle_one_element(depth=depth, value=str(element), line_type=HierarchyLevel.raw_text, line_type_meta=HierarchyLevel.raw_text)
result.append(line)
return UnstructuredDocument(tables=[], lines=result, attachments=attachments)
def __exclude_html_fields(self, json_data: dict, field_keys: List[List[str]]) -> dict:
for keys in field_keys:
self.__exclude_key(json_data, keys)
return json_data
def __exclude_key(self, json_data: dict, keys: List[str]) -> None:
data = json_data
parents = []
for key in keys[:-1]:
parents.append((data, key))
data = data[key]
del data[keys[-1]]
for (data, key) in parents[::-1]:
if not data[key]:
del data[key]
def __handle_list(self, depth: int, element: list, result: list, stack: list) -> None:
from dedoc.data_structures.hierarchy_level import HierarchyLevel
for _ in range(len(element)):
sub_element = element.pop(0)
line = self.__handle_one_element(depth=depth, value=sub_element, line_type=HierarchyLevel.list_item, line_type_meta=HierarchyLevel.list_item)
result.append(line)
if not self.__is_flat(sub_element):
stack.append((element, depth))
stack.append((sub_element, depth + 1))
break
def __handle_dict(self, depth: int, element: dict, result: list, stack: list) -> None:
for key in sorted(element.keys()):
# key = min(element.keys()) if len(element) < 100 else list(element.keys())[0]
value = element.pop(key)
line = self.__handle_one_element(depth=depth, value=key, line_type="key", line_type_meta="key")
result.append(line)
stack.append((element, depth))
if value is not None:
stack.append((value, depth + 1))
break
def __handle_one_element(self, depth: int, value: Any, line_type: str, line_type_meta: str) -> LineWithMeta: # noqa
from dedoc.data_structures.hierarchy_level import HierarchyLevel
from dedoc.data_structures.line_metadata import LineMetadata
if depth == 1 and line_type == "title":
level1, level2 = 0, 0
else:
level1, level2 = depth, 1
hierarchy_level = HierarchyLevel(level_1=level1, level_2=level2, can_be_multiline=False, line_type=line_type_meta)
metadata = LineMetadata(tag_hierarchy_level=hierarchy_level, page_id=0, line_id=None)
line = LineWithMeta(line=self.__get_text(value), metadata=metadata)
return line
def __is_flat(self, value: Any) -> bool: # noqa
return not isinstance(value, (dict, list))
def __get_text(self, value: Any) -> str: # noqa
if isinstance(value, (dict, list)) or value is None:
return ""
return str(value)