Source code for dedoc.readers.txt_reader.raw_text_reader

from typing import Iterable, List, Optional, Tuple

from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.base_reader import BaseReader


class RawTextReader(BaseReader):
    """
    This class allows to parse files with the following extensions: .txt, .txt.gz
    """

    # Number of spaces a tab character counts for whenever leading whitespace is measured.
    # Shared by the paragraph detection and the indentation annotation so both agree.
    _TAB_WIDTH = 4

    def __init__(self, *, config: Optional[dict] = None) -> None:
        """
        Register the text-like extensions and mime types with the base reader and
        compile the regular expression used to find leading whitespace.

        :param config: optional configuration dictionary forwarded to the base reader
        """
        import re
        from dedoc.extensions import recognized_extensions, recognized_mimes

        super().__init__(config=config, recognized_extensions=recognized_extensions.txt_like_format, recognized_mimes=recognized_mimes.txt_like_format)
        self.space_regexp = re.compile(r"^\s+")

    def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
        """
        Check if the document extension is suitable for this reader.
        Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
        """
        from dedoc.utils.utils import get_mime_extension

        mime, extension = get_mime_extension(file_path=file_path, mime=mime, extension=extension)
        # this code differs from BaseReader because other formats can have text/plain mime type,
        # so the extension (when there is one) decides and the mime type is only a fallback
        if extension:
            return extension.lower() in self._recognized_extensions
        return mime in self._recognized_mimes

    def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
        """
        This method returns only document lines.
        Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
        """
        parameters = {} if parameters is None else parameters
        encoding = self.__get_encoding(path=file_path, parameters=parameters)
        lines = self._get_lines_with_meta(path=file_path, encoding=encoding)
        # the encoding that was actually used is reported to the caller through the warnings list
        encoding_warning = f"encoding is {encoding}"
        result = UnstructuredDocument(lines=lines, tables=[], attachments=[], warnings=[encoding_warning])
        return self._postprocess(result)

    def __get_encoding(self, path: str, parameters: dict) -> str:
        """
        Return the encoding used to decode the file.

        A non-empty ``parameters["encoding"]`` wins; otherwise the encoding is obtained from
        ``get_encoding(path, "utf-8")``.
        """
        from dedoc.utils.utils import get_encoding

        if parameters.get("encoding"):
            return parameters["encoding"]
        return get_encoding(path, "utf-8")

    def _get_lines_with_meta(self, path: str, encoding: str) -> List[LineWithMeta]:
        """
        Read the file line by line and wrap every line into :class:`LineWithMeta`
        carrying a spacing annotation and an indentation annotation.

        :param path: path to the file to read
        :param encoding: encoding used to decode the file content
        :return: list of lines in the order they appear in the file
        """
        import time
        from dedoc.data_structures.concrete_annotations.spacing_annotation import SpacingAnnotation
        from dedoc.data_structures.hierarchy_level import HierarchyLevel
        from dedoc.data_structures.line_metadata import LineMetadata
        from dedoc.utils.utils import calculate_file_hash

        lines = []
        file_hash = calculate_file_hash(path=path)
        # number of consecutive whitespace-only lines directly before the current line
        number_of_empty_lines = 0
        previous_log_time = time.time()

        for line_id, line in self.__get_lines(path=path, encoding=encoding):
            # progress message at most once per 5 seconds
            if time.time() - previous_log_time > 5:
                self.logger.info(f"done {line_id} lines")
                previous_log_time = time.time()

            metadata = LineMetadata(page_id=0, line_id=line_id)
            uid = f"txt_{file_hash}_{line_id}"
            # "50" when the line directly follows another text line, otherwise 100 per preceding empty line
            spacing_annotation_value = str(int(100 * (0.5 if number_of_empty_lines == 0 else number_of_empty_lines)))
            spacing_annotation = SpacingAnnotation(start=0, end=len(line), value=spacing_annotation_value)
            indent_annotation = self.__get_indent_annotation(line)

            line_with_meta = LineWithMeta(line=line, metadata=metadata, annotations=[spacing_annotation, indent_annotation], uid=uid)
            line_with_meta.metadata.tag_hierarchy_level = HierarchyLevel.create_unknown()
            lines.append(line_with_meta)

            # the counter grows on whitespace-only lines and is reset by any other line
            number_of_empty_lines = number_of_empty_lines + 1 if line.isspace() else 0

        return lines

    @staticmethod
    def __normalize_text(text: str) -> str:
        """
        Bring the text to NFC form and fold "и" + combining breve into the precomposed "й".

        The literals are written as escapes on purpose: the decomposed and the precomposed
        forms look identical on screen and are easily merged by editors or text tooling.
        """
        from unicodedata import normalize

        return normalize("NFC", text).replace("\u0438\u0306", "\u0439")

    def __get_lines(self, path: str, encoding: str) -> Iterable[Tuple[int, str]]:
        """
        Lazily yield ``(line_id, line)`` pairs from the file.

        Paths ending with "txt" (case-insensitive) are read as plain text, every other path is
        opened with gzip. Bytes that cannot be decoded are skipped in both cases.
        """
        import codecs
        import gzip

        if path.lower().endswith("txt"):
            with codecs.open(path, errors="ignore", encoding=encoding) as file:
                for line_id, line in enumerate(file):
                    yield line_id, self.__normalize_text(line)
        else:
            with gzip.open(path) as file:
                for line_id, raw_line in enumerate(file):
                    # same error policy as the plain-text branch: a broken byte must not abort the reading
                    line = raw_line.decode(encoding, errors="ignore")
                    yield line_id, self.__normalize_text(line)

    def __get_starting_spacing(self, line: Optional[LineWithMeta]) -> int:
        """
        Return the width of the leading whitespace of the line, a tab counting as ``_TAB_WIDTH`` spaces.

        A missing line and a whitespace-only line both give 0.
        """
        if line is None or line.line.isspace():
            return 0
        space_this = self.space_regexp.match(line.line.replace("\t", " " * self._TAB_WIDTH))
        if space_this is None:
            return 0
        return space_this.end() - space_this.start()

    def __is_paragraph(self, line: LineWithMeta, previous_line: Optional[LineWithMeta]) -> bool:
        """
        Tell whether the line starts a paragraph: it is not whitespace-only and is indented
        at least 2 positions deeper than the previous line.
        """
        space_this = self.__get_starting_spacing(line)
        space_prev = self.__get_starting_spacing(previous_line)
        return not line.line.isspace() and space_this - space_prev >= 2

    def _postprocess(self, document: UnstructuredDocument) -> UnstructuredDocument:
        """
        Set ``can_be_multiline`` of every line: a line that starts a paragraph cannot be
        a continuation of the previous one. The document is modified in place and returned.
        """
        previous_line = None
        for line in document.lines:
            is_paragraph = self.__is_paragraph(line=line, previous_line=previous_line)
            line.metadata.tag_hierarchy_level.can_be_multiline = not is_paragraph
            previous_line = line
        return document

    def __get_indent_annotation(self, line: str) -> IndentationAnnotation:
        """
        Build the indentation annotation covering the whole line.

        The value is "0" when the line has no leading whitespace, otherwise the number of
        leading whitespace characters (a tab counting as ``_TAB_WIDTH``) times a fixed factor.
        """
        space_group = self.space_regexp.match(line)
        if space_group is None:
            return IndentationAnnotation(start=0, end=len(line), value="0")
        space_cnt = sum(self._TAB_WIDTH if char == "\t" else 1 for char in space_group.group())
        # NOTE(review): 211 is an unexplained scale factor; confirm the unit IndentationAnnotation expects
        return IndentationAnnotation(start=0, end=len(line), value=str(211 * space_cnt))