dedoc
v1.1.1
Getting started:
Dedoc installation
Dedoc usage tutorial
Tutorials
Adding support for a new document type to Dedoc
Creating Dedoc Document from basic data structures in code
Dedoc API usage
Using dedoc via API
API schema
Description of the API output format
Readers output
Text annotations
Types of textual lines
Structure types
Default document structure type
Law structure type
Technical specification structure type
Diploma structure type
Package Reference
Dedoc pipeline
dedoc.data_structures
dedoc.converters
dedoc.readers
dedoc.attachments_extractors
dedoc.metadata_extractors
dedoc.structure_extractors
dedoc.structure_constructors
Notes
Changelog
dedoc
Index
Edit on GitHub
Index
_ | A | B | C | D | E | F | G | H | I | J | L | M | N | O | P | R | S | T | U | V | W | X | Y
_
__add__() (dedoc.data_structures.LineWithMeta method)
__eq__() (dedoc.data_structures.HierarchyLevel method)
__getitem__() (dedoc.data_structures.LineWithMeta method)
__init__() (dedoc.attachments_extractors.PDFAttachmentsExtractor method)
(dedoc.attachments_handler.AttachmentsHandler method)
(dedoc.converters.AbstractConverter method)
(dedoc.converters.FileConverterComposition method)
(dedoc.data_structures.AlignmentAnnotation method)
(dedoc.data_structures.Annotation method)
(dedoc.data_structures.AttachAnnotation method)
(dedoc.data_structures.AttachedFile method)
(dedoc.data_structures.BBoxAnnotation method)
(dedoc.data_structures.BoldAnnotation method)
(dedoc.data_structures.CellWithMeta method)
(dedoc.data_structures.ColorAnnotation method)
(dedoc.data_structures.ConfidenceAnnotation method)
(dedoc.data_structures.DocumentContent method)
(dedoc.data_structures.DocumentMetadata method)
(dedoc.data_structures.HierarchyLevel method)
(dedoc.data_structures.IndentationAnnotation method)
(dedoc.data_structures.ItalicAnnotation method)
(dedoc.data_structures.LineMetadata method)
(dedoc.data_structures.LineWithMeta method)
(dedoc.data_structures.LinkedTextAnnotation method)
(dedoc.data_structures.ParsedDocument method)
(dedoc.data_structures.SizeAnnotation method)
(dedoc.data_structures.SpacingAnnotation method)
(dedoc.data_structures.StrikeAnnotation method)
(dedoc.data_structures.StyleAnnotation method)
(dedoc.data_structures.SubscriptAnnotation method)
(dedoc.data_structures.SuperscriptAnnotation method)
(dedoc.data_structures.Table method)
(dedoc.data_structures.TableAnnotation method)
(dedoc.data_structures.TableMetadata method)
(dedoc.data_structures.TreeNode method)
(dedoc.data_structures.UnderlinedAnnotation method)
(dedoc.data_structures.UnstructuredDocument method)
(dedoc.DedocManager method)
(dedoc.metadata_extractors.ImageMetadataExtractor method)
(dedoc.metadata_extractors.MetadataExtractorComposition method)
(dedoc.metadata_extractors.NoteMetadataExtractor method)
(dedoc.metadata_extractors.PdfMetadataExtractor method)
(dedoc.readers.ArchiveReader method)
(dedoc.readers.CSVReader method)
(dedoc.readers.DocxReader method)
(dedoc.readers.EmailReader method)
(dedoc.readers.ExcelReader method)
(dedoc.readers.HtmlReader method)
(dedoc.readers.JsonReader method)
(dedoc.readers.MhtmlReader method)
(dedoc.readers.NoteReader method)
(dedoc.readers.PdfAutoReader method)
(dedoc.readers.PdfBaseReader method)
(dedoc.readers.PdfImageReader method)
(dedoc.readers.PdfTabbyReader method)
(dedoc.readers.PdfTxtlayerReader method)
(dedoc.readers.PptxReader method)
(dedoc.readers.RawTextReader method)
(dedoc.readers.ReaderComposition method)
(dedoc.structure_constructors.StructureConstructorComposition method)
(dedoc.structure_extractors.AbstractLawStructureExtractor method)
(dedoc.structure_extractors.ClassifyingLawStructureExtractor method)
(dedoc.structure_extractors.DiplomaStructureExtractor method)
(dedoc.structure_extractors.FoivLawStructureExtractor method)
(dedoc.structure_extractors.LawStructureExtractor method)
(dedoc.structure_extractors.StructureExtractorComposition method)
(dedoc.structure_extractors.TzStructureExtractor method)
(dedocutils.data_structures.BBox method)
__len__() (dedoc.data_structures.LineWithMeta method)
__lt__() (dedoc.data_structures.HierarchyLevel method)
(dedoc.data_structures.LineWithMeta method)
A
AbstractAttachmentsExtractor (class in dedoc.attachments_extractors)
AbstractConverter (class in dedoc.converters)
AbstractLawStructureExtractor (class in dedoc.structure_extractors)
AbstractMetadataExtractor (class in dedoc.metadata_extractors)
AbstractOfficeAttachmentsExtractor (class in dedoc.attachments_extractors)
AbstractStructureConstructor (class in dedoc.structure_constructors)
AbstractStructureExtractor (class in dedoc.structure_extractors)
access_time (dedoc.api.schema.DocumentMetadata attribute)
add_child() (dedoc.data_structures.TreeNode method)
add_text() (dedoc.data_structures.TreeNode method)
AlignmentAnnotation (class in dedoc.data_structures)
Annotation (class in dedoc.api.schema)
(class in dedoc.data_structures)
annotations (dedoc.api.schema.LineWithMeta attribute)
(dedoc.api.schema.TreeNode attribute)
(dedoc.data_structures.LineWithMeta property)
ArchiveReader (class in dedoc.readers)
AttachAnnotation (class in dedoc.data_structures)
AttachedFile (class in dedoc.data_structures)
attachments (dedoc.api.schema.ParsedDocument attribute)
AttachmentsHandler (class in dedoc.attachments_handler)
B
BaseMetadataExtractor (class in dedoc.metadata_extractors)
BaseReader (class in dedoc.readers)
BBox (class in dedocutils.data_structures)
BBoxAnnotation (class in dedoc.data_structures)
BinaryConverter (class in dedoc.converters)
BoldAnnotation (class in dedoc.data_structures)
C
can_convert() (dedoc.converters.AbstractConverter method)
(dedoc.converters.BinaryConverter method)
(dedoc.converters.DocxConverter method)
(dedoc.converters.ExcelConverter method)
(dedoc.converters.PDFConverter method)
(dedoc.converters.PNGConverter method)
(dedoc.converters.PptxConverter method)
(dedoc.converters.TxtConverter method)
can_extract() (dedoc.attachments_extractors.AbstractAttachmentsExtractor method)
(dedoc.attachments_extractors.DocxAttachmentsExtractor method)
(dedoc.attachments_extractors.ExcelAttachmentsExtractor method)
(dedoc.attachments_extractors.JsonAttachmentsExtractor method)
(dedoc.attachments_extractors.PDFAttachmentsExtractor method)
(dedoc.attachments_extractors.PptxAttachmentsExtractor method)
(dedoc.metadata_extractors.AbstractMetadataExtractor method)
(dedoc.metadata_extractors.BaseMetadataExtractor method)
(dedoc.metadata_extractors.DocxMetadataExtractor method)
(dedoc.metadata_extractors.ImageMetadataExtractor method)
(dedoc.metadata_extractors.NoteMetadataExtractor method)
(dedoc.metadata_extractors.PdfMetadataExtractor method)
can_read() (dedoc.readers.ArchiveReader method)
(dedoc.readers.BaseReader method)
(dedoc.readers.CSVReader method)
(dedoc.readers.DocxReader method)
(dedoc.readers.EmailReader method)
(dedoc.readers.ExcelReader method)
(dedoc.readers.HtmlReader method)
(dedoc.readers.JsonReader method)
(dedoc.readers.MhtmlReader method)
(dedoc.readers.NoteReader method)
(dedoc.readers.PdfAutoReader method)
(dedoc.readers.PdfImageReader method)
(dedoc.readers.PdfTabbyReader method)
(dedoc.readers.PdfTxtlayerReader method)
(dedoc.readers.PptxReader method)
(dedoc.readers.RawTextReader method)
cells (dedoc.api.schema.Table attribute)
CellWithMeta (class in dedoc.api.schema)
(class in dedoc.data_structures)
ClassifyingLawStructureExtractor (class in dedoc.structure_extractors)
ColorAnnotation (class in dedoc.data_structures)
colspan (dedoc.api.schema.CellWithMeta attribute)
ConfidenceAnnotation (class in dedoc.data_structures)
content (dedoc.api.schema.ParsedDocument attribute)
create() (dedoc.data_structures.TreeNode static method)
create_raw_text() (dedoc.data_structures.HierarchyLevel static method)
create_root() (dedoc.data_structures.HierarchyLevel static method)
create_unknown() (dedoc.data_structures.HierarchyLevel static method)
created_time (dedoc.api.schema.DocumentMetadata attribute)
CSVReader (class in dedoc.readers)
D
DedocManager (class in dedoc)
DefaultStructureExtractor (class in dedoc.structure_extractors)
DiplomaStructureExtractor (class in dedoc.structure_extractors)
do_convert() (dedoc.converters.AbstractConverter method)
(dedoc.converters.BinaryConverter method)
(dedoc.converters.DocxConverter method)
(dedoc.converters.ExcelConverter method)
(dedoc.converters.PDFConverter method)
(dedoc.converters.PNGConverter method)
(dedoc.converters.PptxConverter method)
(dedoc.converters.TxtConverter method)
do_converting() (dedoc.converters.FileConverterComposition method)
document_type (dedoc.structure_extractors.ClassifyingLawStructureExtractor attribute)
(dedoc.structure_extractors.DefaultStructureExtractor attribute)
(dedoc.structure_extractors.DiplomaStructureExtractor attribute)
(dedoc.structure_extractors.FoivLawStructureExtractor attribute)
(dedoc.structure_extractors.LawStructureExtractor attribute)
(dedoc.structure_extractors.TzStructureExtractor attribute)
DocumentContent (class in dedoc.api.schema)
(class in dedoc.data_structures)
DocumentMetadata (class in dedoc.api.schema)
(class in dedoc.data_structures)
DocxAttachmentsExtractor (class in dedoc.attachments_extractors)
DocxConverter (class in dedoc.converters)
DocxMetadataExtractor (class in dedoc.metadata_extractors)
DocxReader (class in dedoc.readers)
E
EmailReader (class in dedoc.readers)
end (dedoc.api.schema.Annotation attribute)
ExcelAttachmentsExtractor (class in dedoc.attachments_extractors)
ExcelConverter (class in dedoc.converters)
ExcelReader (class in dedoc.readers)
extend_other_fields() (dedoc.data_structures.DocumentMetadata method)
(dedoc.data_structures.LineMetadata method)
extract_metadata() (dedoc.metadata_extractors.AbstractMetadataExtractor method)
(dedoc.metadata_extractors.BaseMetadataExtractor method)
(dedoc.metadata_extractors.DocxMetadataExtractor method)
(dedoc.metadata_extractors.ImageMetadataExtractor method)
(dedoc.metadata_extractors.MetadataExtractorComposition method)
(dedoc.metadata_extractors.NoteMetadataExtractor method)
(dedoc.metadata_extractors.PdfMetadataExtractor method)
extract_structure() (dedoc.structure_extractors.AbstractLawStructureExtractor method)
(dedoc.structure_extractors.AbstractStructureExtractor method)
(dedoc.structure_extractors.ClassifyingLawStructureExtractor method)
(dedoc.structure_extractors.DefaultStructureExtractor method)
(dedoc.structure_extractors.DiplomaStructureExtractor method)
(dedoc.structure_extractors.StructureExtractorComposition method)
(dedoc.structure_extractors.TzStructureExtractor method)
F
file_name (dedoc.api.schema.DocumentMetadata attribute)
file_type (dedoc.api.schema.DocumentMetadata attribute)
FileConverterComposition (class in dedoc.converters)
FoivLawStructureExtractor (class in dedoc.structure_extractors)
from_two_points() (dedocutils.data_structures.BBox static method)
G
get_annotations() (dedoc.data_structures.CellWithMeta method)
get_attachments() (dedoc.attachments_extractors.AbstractAttachmentsExtractor method)
(dedoc.attachments_extractors.DocxAttachmentsExtractor method)
(dedoc.attachments_extractors.ExcelAttachmentsExtractor method)
(dedoc.attachments_extractors.JsonAttachmentsExtractor method)
(dedoc.attachments_extractors.PDFAttachmentsExtractor method)
(dedoc.attachments_extractors.PptxAttachmentsExtractor method)
get_root() (dedoc.data_structures.TreeNode method)
get_text() (dedoc.data_structures.CellWithMeta method)
H
handle_attachments() (dedoc.attachments_handler.AttachmentsHandler method)
have_intersection_with_box() (dedocutils.data_structures.BBox method)
height (dedocutils.data_structures.BBox attribute)
HierarchyLevel (class in dedoc.data_structures)
HtmlReader (class in dedoc.readers)
I
ImageMetadataExtractor (class in dedoc.metadata_extractors)
IndentationAnnotation (class in dedoc.data_structures)
invisible (dedoc.api.schema.CellWithMeta attribute)
is_list_item() (dedoc.data_structures.HierarchyLevel method)
is_raw_text() (dedoc.data_structures.HierarchyLevel method)
is_unknown() (dedoc.data_structures.HierarchyLevel method)
ItalicAnnotation (class in dedoc.data_structures)
J
join() (dedoc.data_structures.LineWithMeta static method)
JsonAttachmentsExtractor (class in dedoc.attachments_extractors)
JsonReader (class in dedoc.readers)
L
LawStructureExtractor (class in dedoc.structure_extractors)
line (dedoc.data_structures.LineWithMeta property)
line_id (dedoc.api.schema.LineMetadata attribute)
LinearConstructor (class in dedoc.structure_constructors)
LineMetadata (class in dedoc.api.schema)
(class in dedoc.data_structures)
lines (dedoc.api.schema.CellWithMeta attribute)
LineWithMeta (class in dedoc.api.schema)
(class in dedoc.data_structures)
LinkedTextAnnotation (class in dedoc.data_structures)
M
metadata (dedoc.api.schema.ParsedDocument attribute)
(dedoc.api.schema.Table attribute)
(dedoc.api.schema.TreeNode attribute)
(dedoc.data_structures.LineWithMeta property)
MetadataExtractorComposition (class in dedoc.metadata_extractors)
MhtmlReader (class in dedoc.readers)
modified_time (dedoc.api.schema.DocumentMetadata attribute)
N
name (dedoc.api.schema.Annotation attribute)
(dedoc.data_structures.AlignmentAnnotation attribute)
(dedoc.data_structures.AttachAnnotation attribute)
(dedoc.data_structures.BBoxAnnotation attribute)
(dedoc.data_structures.BoldAnnotation attribute)
(dedoc.data_structures.ColorAnnotation attribute)
(dedoc.data_structures.ConfidenceAnnotation attribute)
(dedoc.data_structures.IndentationAnnotation attribute)
(dedoc.data_structures.ItalicAnnotation attribute)
(dedoc.data_structures.LinkedTextAnnotation attribute)
(dedoc.data_structures.SizeAnnotation attribute)
(dedoc.data_structures.SpacingAnnotation attribute)
(dedoc.data_structures.StrikeAnnotation attribute)
(dedoc.data_structures.StyleAnnotation attribute)
(dedoc.data_structures.SubscriptAnnotation attribute)
(dedoc.data_structures.SuperscriptAnnotation attribute)
(dedoc.data_structures.TableAnnotation attribute)
(dedoc.data_structures.UnderlinedAnnotation attribute)
node_id (dedoc.api.schema.TreeNode attribute)
NoteMetadataExtractor (class in dedoc.metadata_extractors)
NoteReader (class in dedoc.readers)
O
other_fields (dedoc.api.schema.DocumentMetadata attribute)
(dedoc.api.schema.LineMetadata attribute)
P
page_id (dedoc.api.schema.LineMetadata attribute)
(dedoc.api.schema.TableMetadata attribute)
paragraph_type (dedoc.api.schema.LineMetadata attribute)
parse() (dedoc.DedocManager method)
parse_file() (dedoc.readers.ReaderComposition method)
ParsedDocument (class in dedoc.api.schema)
(class in dedoc.data_structures)
PDFAttachmentsExtractor (class in dedoc.attachments_extractors)
PdfAutoReader (class in dedoc.readers)
PdfBaseReader (class in dedoc.readers)
PDFConverter (class in dedoc.converters)
PdfImageReader (class in dedoc.readers)
PdfMetadataExtractor (class in dedoc.metadata_extractors)
PdfTabbyReader (class in dedoc.readers)
PdfTxtlayerReader (class in dedoc.readers)
PNGConverter (class in dedoc.converters)
PptxAttachmentsExtractor (class in dedoc.attachments_extractors)
PptxConverter (class in dedoc.converters)
PptxReader (class in dedoc.readers)
R
RawTextReader (class in dedoc.readers)
read() (dedoc.readers.ArchiveReader method)
(dedoc.readers.BaseReader method)
(dedoc.readers.CSVReader method)
(dedoc.readers.DocxReader method)
(dedoc.readers.EmailReader method)
(dedoc.readers.ExcelReader method)
(dedoc.readers.HtmlReader method)
(dedoc.readers.JsonReader method)
(dedoc.readers.MhtmlReader method)
(dedoc.readers.NoteReader method)
(dedoc.readers.PdfAutoReader method)
(dedoc.readers.PdfBaseReader method)
(dedoc.readers.PdfTabbyReader method)
(dedoc.readers.PptxReader method)
(dedoc.readers.RawTextReader method)
ReaderComposition (class in dedoc.readers)
rotated_angle (dedoc.api.schema.TableMetadata attribute)
rowspan (dedoc.api.schema.CellWithMeta attribute)
S
Serializable (class in dedoc.data_structures)
set_line() (dedoc.data_structures.LineWithMeta method)
size (dedoc.api.schema.DocumentMetadata attribute)
SizeAnnotation (class in dedoc.data_structures)
SpacingAnnotation (class in dedoc.data_structures)
split() (dedoc.data_structures.LineWithMeta method)
square (dedocutils.data_structures.BBox property)
start (dedoc.api.schema.Annotation attribute)
StrikeAnnotation (class in dedoc.data_structures)
structure (dedoc.api.schema.DocumentContent attribute)
structure_document() (dedoc.structure_constructors.AbstractStructureConstructor method)
(dedoc.structure_constructors.LinearConstructor method)
(dedoc.structure_constructors.StructureConstructorComposition method)
(dedoc.structure_constructors.TreeConstructor method)
StructureConstructorComposition (class in dedoc.structure_constructors)
StructureExtractorComposition (class in dedoc.structure_extractors)
StyleAnnotation (class in dedoc.data_structures)
subparagraphs (dedoc.api.schema.TreeNode attribute)
SubscriptAnnotation (class in dedoc.data_structures)
SuperscriptAnnotation (class in dedoc.data_structures)
T
Table (class in dedoc.api.schema)
(class in dedoc.data_structures)
TableAnnotation (class in dedoc.data_structures)
TableMetadata (class in dedoc.api.schema)
(class in dedoc.data_structures)
tables (dedoc.api.schema.DocumentContent attribute)
temporary_file_name (dedoc.api.schema.DocumentMetadata attribute)
text (dedoc.api.schema.LineWithMeta attribute)
(dedoc.api.schema.TreeNode attribute)
to_api_schema() (dedoc.data_structures.CellWithMeta method)
(dedoc.data_structures.DocumentContent method)
(dedoc.data_structures.DocumentMetadata method)
(dedoc.data_structures.LineMetadata method)
(dedoc.data_structures.LineWithMeta method)
(dedoc.data_structures.ParsedDocument method)
(dedoc.data_structures.Serializable method)
(dedoc.data_structures.Table method)
(dedoc.data_structures.TableMetadata method)
(dedoc.data_structures.TreeNode method)
TreeConstructor (class in dedoc.structure_constructors)
TreeNode (class in dedoc.api.schema)
(class in dedoc.data_structures)
TxtConverter (class in dedoc.converters)
TzStructureExtractor (class in dedoc.structure_extractors)
U
uid (dedoc.api.schema.DocumentMetadata attribute)
(dedoc.api.schema.TableMetadata attribute)
(dedoc.data_structures.LineWithMeta property)
UnderlinedAnnotation (class in dedoc.data_structures)
UnstructuredDocument (class in dedoc.data_structures)
V
value (dedoc.api.schema.Annotation attribute)
version (dedoc.api.schema.ParsedDocument attribute)
W
warnings (dedoc.api.schema.ParsedDocument attribute)
width (dedocutils.data_structures.BBox attribute)
with_attachments() (dedoc.attachments_extractors.AbstractAttachmentsExtractor static method)
X
x_bottom_right (dedocutils.data_structures.BBox attribute)
x_top_left (dedocutils.data_structures.BBox attribute)
Y
y_bottom_right (dedocutils.data_structures.BBox attribute)
y_top_left (dedocutils.data_structures.BBox attribute)