Source code for dedoc.readers.article_reader.article_reader

from typing import Dict, List, Optional, Tuple

from bs4 import Tag
from numpy import ndarray

from dedoc.data_structures.annotation import Annotation
from dedoc.data_structures.attached_file import AttachedFile
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.data_structures.table import Table
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.base_reader import BaseReader


[docs]class ArticleReader(BaseReader):
    """
    This class is used for parsing scientific articles with .pdf extension using `GROBID <https://grobid.readthedocs.io/en/latest/>`_ system.
    """

    def __init__(self, config: Optional[dict] = None) -> None:
        """
        Configure the reader and resolve the GROBID endpoint from environment variables.

        The base URL is taken from ``GROBID_URL`` when it is set and non-empty, otherwise it is built as
        ``http://<GROBID_HOST>:<GROBID_PORT>`` (defaults: ``localhost`` and ``8070``).
        If ``GROBID_AUTH_KEY`` is set and non-empty, its value is sent as the ``Authorization`` header of every request.

        :param config: configuration dictionary forwarded to the base reader
        """
        import os
        from dedoc.extensions import recognized_extensions, recognized_mimes

        super().__init__(config=config, recognized_extensions=recognized_extensions.pdf_like_format, recognized_mimes=recognized_mimes.pdf_like_format)

        grobid_url = os.environ.get("GROBID_URL", "")
        if not grobid_url:
            grobid_url = f"http://{os.environ.get('GROBID_HOST', 'localhost')}:{os.environ.get('GROBID_PORT', '8070')}"
        # endpoints are appended as "/api/...", so a trailing slash in the configured URL would give "//api/..."
        self.grobid_url = grobid_url.rstrip("/")
        self.url = f"{self.grobid_url}/api/processFulltextDocument"

        auth_key = os.environ.get("GROBID_AUTH_KEY", "")
        self.request_headers = {"Authorization": auth_key} if auth_key else {}
        # updated lazily by can_read(): the service is not contacted during construction
        self.grobid_is_alive = False

[docs]    def read(self, file_path: str, parameters: Optional[dict] = None) -> UnstructuredDocument:
        """
        The method calls the service GROBID method ``/api/processFulltextDocument`` and analyzes the result (format XML/TEI) of the recognized article
        using beautifulsoup library.
        As a result, the method fills the class :class:`~dedoc.data_structures.UnstructuredDocument`.
        Article reader adds additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
        The method extracts information about ``authors``, ``keywords``, ``bibliography items``, ``sections``, and ``tables``.
        In table cells, ``colspan`` attribute can be filled according to the GROBID's "cols" attribute.
        You can find more information about the extracted information from GROBID system on the page :ref:`article_structure`.

        Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
        """
        import requests
        from bs4 import BeautifulSoup

        with open(file_path, "rb") as file:
            files = {"input": file}
            try:
                response = requests.post(self.url, files=files, data={"teiCoordinates": "figure"}, headers=self.request_headers)
                if response.status_code != 200:
                    warning = f"GROBID returns code {response.status_code}."
                    self.logger.warning(warning)
                    return UnstructuredDocument(tables=[], lines=[], attachments=[], warnings=[warning])
            except requests.exceptions.ConnectionError as ex:
                warning = f"GROBID doesn't response. Check GROBID service on {self.url}. Exception' msg: {ex}"
                self.logger.warning(warning)
                return UnstructuredDocument(tables=[], lines=[], attachments=[], warnings=[warning])

            soup = BeautifulSoup(response.text, features="xml")
            lines = self.__parse_title(soup)

            if soup.biblStruct is not None:
                authors = soup.biblStruct.find_all("author")
                lines += [line for author in authors for line in self.__parse_author(author)]
            lines += self.__parse_keywords(soup.keywords)

            bib_lines, bib2uid = self.__parse_bibliography(soup)
            tables, table2uid = self.__parse_tables(soup)
            attachments, attachment2uid = self.__parse_images(soup, file_path, parameters)

            lines += self.__parse_text(soup, bib2uid, table2uid, attachment2uid)
            lines.extend(bib_lines)

            return UnstructuredDocument(tables=tables, lines=lines, attachments=attachments, warnings=["use GROBID (version: 0.8.0)"])

[docs]    def can_read(self, file_path: Optional[str] = None, mime: Optional[str] = None, extension: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
        """
        Check if:

        * the document extension is suitable for this reader (.pdf);
        * parameter "document_type" is "article";
        * GROBID service is running on port 8070.

        Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
        """
        from dedoc.utils.parameter_utils import get_param_document_type

        if get_param_document_type(parameters) != "article":
            return False

        self.__update_grobid_alive(self.grobid_url, max_attempts=self.config.get("grobid_max_connection_attempts", 3))
        if not self.grobid_is_alive:
            return False

        return super().can_read(file_path=file_path, mime=mime, extension=extension)

    def __update_grobid_alive(self, grobid_url: str, max_attempts: int = 2) -> None:
        """
        Probe ``<grobid_url>/api/isalive`` and store the result in ``self.grobid_is_alive``.

        Nothing is done if the service has already been seen alive.
        Otherwise up to ``max_attempts`` requests are sent with a 5 second pause between consecutive attempts.

        :param grobid_url: base URL of the GROBID service
        :param max_attempts: maximal number of requests, no request is sent if the value is not positive
        """
        import time
        import requests

        if self.grobid_is_alive:
            return

        attempts_left = max_attempts
        while attempts_left > 0:
            try:
                response = requests.get(f"{grobid_url}/api/isalive", headers=self.request_headers)
                if response.status_code == 200:
                    self.logger.info(f"GROBID up on {grobid_url}.")
                    self.grobid_is_alive = True
                    return
                self.logger.warning(f"GROBID returns code {response.status_code} on {grobid_url}/api/isalive.")
            except requests.exceptions.ConnectionError as ex:
                self.logger.warning(f"GROBID doesn't response. Check GROBID service on {self.url}. Exception's msg: {ex}")

            attempts_left -= 1
            # wait only if another attempt follows: sleeping after the last one just delays the negative answer
            if attempts_left > 0:
                time.sleep(5)

        self.grobid_is_alive = False

    def __get_tag_by_hierarchy_path(self, source: Tag, hierarchy_path: List[str]) -> Optional[str]:
        cur_tag = source
        for path_item in hierarchy_path:
            cur_tag = cur_tag.find(path_item)
            if cur_tag is None:
                # tag not found
                return ""

        return ArticleReader.__tag2text(cur_tag)

    def __create_line(self, text: str, hierarchy_level_id: Optional[int] = None, paragraph_type: Optional[str] = None,
                      annotations: Optional[List[Annotation]] = None, other_fields: Optional[Dict] = None) -> LineWithMeta:
        """
        Wrap ``text`` into a line with metadata (page and line numbers are always 0).

        :param text: text of the line, must be a string
        :param hierarchy_level_id: first level of the tag hierarchy level
        :param paragraph_type: line type of the tag hierarchy level
        :param annotations: annotations to attach to the line
        :param other_fields: additional keyword arguments passed to the line metadata
        :return: line with a raw text hierarchy level if ``hierarchy_level_id`` or ``paragraph_type`` is missing,
            otherwise with the level (hierarchy_level_id, 0) of the given type
        """
        from dedoc.data_structures.hierarchy_level import HierarchyLevel
        from dedoc.data_structures.line_metadata import LineMetadata

        # TODO check on improve
        assert text is not None
        assert isinstance(text, str)
        extra_metadata = {} if other_fields is None else other_fields

        has_level = hierarchy_level_id is not None and paragraph_type is not None
        if has_level:
            level = HierarchyLevel(level_1=hierarchy_level_id, level_2=0, can_be_multiline=False, line_type=paragraph_type)
        else:
            level = HierarchyLevel.create_raw_text()

        metadata = LineMetadata(page_id=0, line_id=0, tag_hierarchy_level=level, **extra_metadata)
        return LineWithMeta(line=text, metadata=metadata, annotations=annotations)

    def __parse_affiliation(self, affiliation_tag: Tag) -> List[LineWithMeta]:
        """
        Convert an <affiliation> tag into lines: the affiliation key, then the organization name and the address if present.

        Note: non-tag children of the <address> tag are removed from the parsed tree as a side effect.

        :param affiliation_tag: <affiliation> tag of an author
        :return: line of type "author_affiliation" optionally followed by lines of types "org_name" and "address"
        """
        # the "key" attribute may be absent, while a line can be created only from a string
        lines = [self.__create_line(text=affiliation_tag.get("key", ""), hierarchy_level_id=2, paragraph_type="author_affiliation")]

        if affiliation_tag.orgName:
            lines.append(self.__create_line(text=self.__tag2text(affiliation_tag.orgName), hierarchy_level_id=3, paragraph_type="org_name"))

        if affiliation_tag.address:
            # only nested tags (address line, post code, ...) are kept and joined with commas
            lines.append(self.__create_line(text=self.__remove_newlines(affiliation_tag.address).get_text(separator=", "),
                                            hierarchy_level_id=3,
                                            paragraph_type="address"))

        return lines

    def __parse_author(self, author_tag: Tag) -> List[LineWithMeta]:
        """
        Convert an <author> tag into lines: an empty "author" line, then the first name, the surname, emails and affiliations.

        Example:
        <author>
            <persName><forename type="first">Sonia</forename><surname>Belaïd</surname></persName>
            <affiliation key="aff0">
                <orgName type="institution">École Normale Supérieure</orgName>
                <address>
                    <addrLine>45 rue dUlm</addrLine>
                    <postCode>75005</postCode>
                    <settlement>Paris</settlement>
                </address>
            </affiliation>
            <affiliation key="aff1">
                <orgName type="institution">Thales Communications &amp; Security</orgName>
                <address>
                    <addrLine>4 Avenue des Louvresses</addrLine>
                    <postCode>92230</postCode>
                    <settlement>Gennevilliers</settlement>
                </address>
            </affiliation>
        </author>
        """
        lines = [self.__create_line(text="", hierarchy_level_id=1, paragraph_type="author")]

        # name parts are looked up inside <persName>, absent or empty parts are skipped
        name_parts = (("forename", "author_first_name"), ("surname", "author_surname"))
        for tag_name, line_type in name_parts:
            value = self.__get_tag_by_hierarchy_path(author_tag, ["persName", tag_name])
            if value:
                lines.append(self.__create_line(text=value, hierarchy_level_id=2, paragraph_type=line_type))

        for email in author_tag.find_all("email"):
            if email:
                lines.append(self.__create_line(text=email.get_text(), hierarchy_level_id=3, paragraph_type="email"))

        for affiliation in author_tag.find_all("affiliation"):
            lines.extend(self.__parse_affiliation(affiliation))

        return lines

    def __parse_keywords(self, keywords_tag: Tag) -> List[LineWithMeta]:
        """
        Convert the <keywords> tag into an empty "keywords" line followed by one "keyword" line per <term>.
        An empty list is returned if the tag is None.

        <keywords>
            <term>Multi-Object Tracking</term>
            <term>Data Association</term>
            <term>Survey</term>
        </keywords>
        """
        if keywords_tag is None:
            return []

        header = self.__create_line(text="", hierarchy_level_id=1, paragraph_type="keywords")
        terms = [self.__create_line(text=term.text, hierarchy_level_id=2, paragraph_type="keyword") for term in keywords_tag.find_all("term")]
        return [header, *terms]

    def __create_line_with_refs(self, content: List[Tuple[str, Tag]], bib2uid: dict, table2uid: dict, attachment2uid: dict) -> LineWithMeta:
        """
        Build one raw text line from the parts of a paragraph and annotate the references found in it.

        Strings are appended as is. A <ref> tag contributes its text and, if its ``target`` is known, an annotation
        over this text: a reference annotation for type "bibr", a table annotation for type "table"
        and an attachment annotation for type "figure". Tags other than <ref> contribute no text.

        :param content: parts of the paragraph (strings and tags); a plain string is also accepted and is handled character by character
        :param bib2uid: mapping from the target of a bibliography reference to the uid of the bibliography item
        :param table2uid: mapping from the target of a table reference to the uid of the table
        :param attachment2uid: mapping from the target of a figure reference to the uid of the attached image
        :return: line with the concatenated text and the collected annotations
        """
        from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
        from dedoc.data_structures.concrete_annotations.reference_annotation import ReferenceAnnotation
        from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation

        text_parts = []
        start = 0  # offset of the current part inside the resulting text
        annotations = []

        for subpart in content:
            if isinstance(subpart, Tag) and subpart.name == "ref":
                # .string is None for an empty <ref/> or a <ref> with nested markup: use the whole text of the tag then,
                # otherwise computing the length below fails
                sub_text = subpart.string if subpart.string is not None else subpart.get_text()
                target = subpart.get("target")
                ref_type = subpart.get("type")
                end = start + len(sub_text)

                if ref_type == "bibr" and target in bib2uid:
                    annotations.append(ReferenceAnnotation(value=bib2uid[target], start=start, end=end))
                elif ref_type == "table" and target in table2uid:
                    annotations.append(TableAnnotation(value=table2uid[target], start=start, end=end))
                elif ref_type == "figure" and target in attachment2uid:
                    annotations.append(AttachAnnotation(attach_uid=attachment2uid[target], start=start, end=end))
            else:
                sub_text = subpart if isinstance(subpart, str) else ""

            text_parts.append(sub_text)
            start += len(sub_text)

        return self.__create_line(text="".join(text_parts), hierarchy_level_id=None, paragraph_type=None, annotations=annotations)

    def __parse_text(self, soup: Tag, bib2uid: dict, table2uid: dict, attachment2uid: dict) -> List[LineWithMeta]:
        """
        Extract the abstract, the sections of the body and the sections of the acknowledgement and annex parts.

        Example of section XML tag:
        <div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1.1">Preprocessing</head><p>...</p><p>...</p></div>

        :param soup: parsed TEI document
        :param bib2uid: mapping from the target of a bibliography reference to the uid of the bibliography item
        :param table2uid: mapping from the target of a table reference to the uid of the table
        :param attachment2uid: mapping from the target of a figure reference to the uid of the attached image
        :return: "Abstract" header line, the abstract text line (empty if the abstract is absent) and the lines of all sections
        """
        lines = []

        # the <abstract> tag may be absent: the header and an empty text line are produced in this case,
        # the same way as for an <abstract> without a paragraph
        abstract_tag = soup.find("abstract")
        abstract = abstract_tag.p if abstract_tag is not None else None
        lines.append(self.__create_line(text="Abstract", hierarchy_level_id=1, paragraph_type="abstract"))
        lines.append(self.__create_line(text=self.__tag2text(abstract)))

        if soup.body is not None:
            for part in soup.body.find_all("div"):
                lines.extend(self.__parse_section(part, bib2uid, table2uid, attachment2uid))

        for other_text_type in ("acknowledgement", "annex"):
            for text_tag in soup.find_all("div", attrs={"type": other_text_type}):
                for part in text_tag.find_all("div"):
                    lines.extend(self.__parse_section(part, bib2uid, table2uid, attachment2uid))

        return lines

    def __parse_section(self, section_tag: Tag, bib2uid: dict, table2uid: dict, attachment2uid: dict) -> List[LineWithMeta]:
        """
        Convert a section <div> into a "section" header line (if the <head> has text) followed by one line per <p> tag.

        The header text is the section number (attribute ``n`` of <head>) with a trailing space plus the head text,
        its hierarchy level is the depth of the dotted number, at least 1.
        A paragraph that is a single string gets "\\n" appended; other non-empty paragraphs are passed by parts
        to collect reference annotations; empty paragraphs are skipped.
        """
        from dedoc.structure_extractors.feature_extractors.list_features.list_utils import get_dotted_item_depth

        head = section_tag.head
        raw_number = head.get("n") if head else ""
        number = raw_number + " " if raw_number else ""
        section_depth = max(get_dotted_item_depth(number), 1)

        lines = []
        title = head.string if head else None
        if title:
            lines.append(self.__create_line(text=number + title, hierarchy_level_id=section_depth, paragraph_type="section"))

        for paragraph in section_tag.find_all("p"):
            plain_text = paragraph.string
            if plain_text is not None:
                content = plain_text + "\n"
            elif paragraph.contents:
                content = paragraph.contents
            else:
                continue
            lines.append(self.__create_line_with_refs(content, bib2uid, table2uid, attachment2uid))

        return lines

    @staticmethod
    def __tag2text(tag: Tag) -> str:
        return "" if not tag or not tag.string else tag.string

    def __parse_tables(self, soup: Tag) -> Tuple[List[Table], dict]:
        """
        Extract tables from the <figure type="table"> tags.

        The title of a table is its head (the leading string of the figure or the text of <head>) followed by the text of <figDesc>.
        A cell with the attribute ``cols`` greater than 1 gets this value as ``colspan`` and is followed by
        ``cols - 1`` invisible cells with the same text. Figures without rows are ignored.

        Example Table with table's ref:
         -----------------------------------------------
            Table Reference Example:
            <ref type="table" target="#tab_0">1</ref>
            ...
            Table Example:
                <figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0">
                    <head>Table 1 .</head>
                    <label>1</label>
                    <figDesc>Performance of some illustrative AES implementations.</figDesc>
                    <table>
                        <row><cell>Software (8-bit)</cell><cell>code size</cell><cell>cycle</cell><cell>cost</cell><cell>physical</cell></row>
                        <row><cell>Implementations</cell><cell>(bytes)</cell><cell>count</cell><cell>function</cell><cell>assumptions</cell></row>
                        <row><cell>Unprotected [13]</cell><cell>1659</cell><cell>4557</cell><cell>7.560</cell><cell>-</cell></row>
                        ...
                    </table>
                </figure>

        :param soup: parsed TEI document
        :return: list of tables and the mapping from "#<xml:id of the figure>" to the uid of the table
        """
        from dedoc.data_structures.cell_with_meta import CellWithMeta
        from dedoc.data_structures.table_metadata import TableMetadata

        tables = []
        table2uid = {}

        tag_tables = soup.find_all("figure", {"type": "table"})
        for table in tag_tables:
            # a figure of type "table" without the inner <table> tag has no cells: handle it as an empty table
            if table.table is None:
                continue

            table_cells = []
            head = table.contents[0] if len(table.contents) > 0 and isinstance(table.contents[0], str) else self.__tag2text(table.head)
            title = head + self.__tag2text(table.figDesc)
            for row in table.table.find_all("row"):
                row_cells = []
                for cell in row.find_all("cell"):
                    cell_text = self.__create_line(self.__tag2text(cell))
                    colspan = int(cell.get("cols", 1))
                    row_cells.append(CellWithMeta(lines=[cell_text], colspan=colspan))

                    # keep the row rectangular: the spanned columns are filled with invisible copies of the cell
                    if colspan > 1:
                        row_cells.extend([CellWithMeta(lines=[cell_text], invisible=True) for _ in range(colspan - 1)])

                table_cells.append(row_cells)

            # ignore empty tables
            if len(table_cells) == 0:
                continue

            tables.append(Table(cells=table_cells, metadata=TableMetadata(page_id=0, title=title)))
            table2uid[f'#{table.get("xml:id")}'] = tables[-1].metadata.uid

        return tables, table2uid

    def __parse_images(self, soup: Tag, file_path: str, parameters: Optional[dict]) -> Tuple[List[AttachedFile], dict]:
        """
        Crop figures from the PDF pages and save them as PNG attachments.

        Nothing is extracted if attachments are not requested in ``parameters`` or the page sizes cannot be read.
        A figure whose handling fails is skipped with a warning, the other figures are still processed.

        Example Figure with figure's ref:
        -----------------------------------------------
            Figure Reference Example:
                <ref type="figure" coords="2,444.07,632.21,4.98,8.74" target="#fig_0">1</ref>
            Figure Example:
                <figure
                    xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0" coords="3,151.56,211.52,312.23,7.89;3,136.68,115.84,338.92,75.24">
                    <head>Fig. 1 .</head>
                    <label>1</label>
                    <figDesc>Fig. 1. Stateful leakage-resilient PRG with N = 2 (left) and N = 256 (right).</figDesc>
                    <graphic coords="3,136.68,115.84,338.92,75.24" type="bitmap"/>
                </figure>
            List of PDF page sizes:
                <facsimile>
                    <surface n="1" ulx="0.0" uly="0.0" lrx="612.0" lry="792.0"/>
                    <surface n="2" ulx="0.0" uly="0.0" lrx="612.0" lry="792.0"/>
                </facsimile>
        Documentation: https://grobid.readthedocs.io/en/latest/Coordinates-in-PDF/

        :param soup: parsed TEI document
        :param file_path: path to the PDF file the figures are cropped from
        :param parameters: parameters of the reading (attachments flag, attachments directory, content analysis flag)
        :return: list of attached images and the mapping from "#<xml:id of the figure>" to the uid of the attachment
        """
        import os
        import uuid
        import cv2
        from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments

        if not get_param_with_attachments(parameters):
            return [], {}

        attachments_dir = get_param_attachments_dir(parameters, file_path)
        need_content_analysis = get_param_need_content_analysis(parameters)
        try:
            # (lrx, lry) of each page in GROBID units, needed to rescale the figure coordinates to the rendered page
            page_sizes = [(float(item["lrx"]), float(item["lry"])) for item in soup.facsimile.find_all("surface")]
        except Exception as e:
            self.logger.warning(f"Exception {e} during attached images handling")
            return [], {}

        attachments, attachment2uid = [], {}
        # figures without the "type" attribute are images (tables are figures of type "table")
        figure_tags = soup.find_all("figure", {"type": None})
        for figure_tag in figure_tags:
            try:
                cropped = self.__get_image(figure_tag, file_path, page_sizes)
                if cropped is None:
                    continue

                uid = f"fig_{uuid.uuid1()}"
                file_name = f"{uid}.png"
                attachment_path = os.path.join(attachments_dir, file_name)
                # imwrite reports a failed write by its return value: do not register an attachment without a file
                if not cv2.imwrite(attachment_path, cropped):
                    self.logger.warning(f"Unable to save the figure image to {attachment_path}")
                    continue

                attachments.append(AttachedFile(original_name=file_name, tmp_file_path=attachment_path, need_content_analysis=need_content_analysis, uid=uid))
                attachment2uid[f'#{figure_tag.get("xml:id")}'] = attachments[-1].uid
            except Exception as e:
                self.logger.warning(f"Exception {e} during figure tag handling:\n{figure_tag}")

        return attachments, attachment2uid

    def __get_image(self, figure_tag: Tag, file_path: str, page_sizes: List[Tuple[float, float]]) -> Optional[ndarray]:
        """
        Crop the PDF page according to the figure's coordinates.
        Figure can consist of multiple sub-figures: we crop the union of all sub-figures located on the page of the first one.
        Example of the figure's coordinates: coords="3,151.56,211.52,312.23,7.89;3,136.68,115.84,338.92,75.24"

        :param figure_tag: <figure> tag with the "coords" attribute, None is returned if the tag has no <graphic> child
        :param file_path: path to the PDF file, the page of the figure is rendered with pdf2image
        :param page_sizes: sizes of the pages in the units of the coordinates, the item with index ``page number - 1`` is used
        :return: cropped part of the rendered page or None
        """
        import math
        import numpy as np
        from pdf2image import convert_from_path

        if figure_tag.graphic is None:
            return None

        first_page, page_image = None, None
        row_min, row_max, col_min, col_max = np.inf, 0, np.inf, 0

        for box_text in figure_tag["coords"].split(";"):
            # each box is "page,v1,v2,v3,v4": v1 and v3 are rescaled by the page width and select image columns,
            # v2 and v4 are rescaled by the page height and select image rows
            fields = box_text.split(",")
            page_number = int(fields[0])

            if first_page is None:
                first_page = page_number
                rendered_pages = convert_from_path(file_path, first_page=page_number, last_page=page_number)
                page_image = np.array(rendered_pages[0])

            if page_number != first_page:
                self.logger.warning("The figure is located on several pages: handle only the first page (we don't handle multi-page images)")
                break

            box = [float(value) for value in fields[1:]]
            pdf_size = page_sizes[page_number - 1]
            image_height, image_width = page_image.shape[0], page_image.shape[1]

            # rescale from the page units to the pixels of the rendered page
            left = box[0] / pdf_size[0] * image_width
            top = box[1] / pdf_size[1] * image_height
            width = box[2] / pdf_size[0] * image_width
            height = box[3] / pdf_size[1] * image_height

            # extend the bounding rectangle, rounding outwards to whole pixels
            row_min = min(row_min, math.floor(top))
            row_max = max(row_max, math.ceil(top + height))
            col_min = min(col_min, math.floor(left))
            col_max = max(col_max, math.ceil(left + width))

        return page_image[row_min:row_max, col_min:col_max]

    def __parse_bibliography(self, soup: Tag) -> Tuple[List[LineWithMeta], dict]:
        """
        Extract the bibliography: a "bibliography" header line and, for each <biblStruct> of <listBibl>,
        a "bibliography_item" line followed by lines with its titles, authors, volume, pages, identifiers, publisher and date.

        Reference Example:
            <ref type="bibr" target="#b5">[6]</ref>
            ...
            <listBibl>
            <biblStruct xml:id="b0">
                <analytic>
                    <title level="a" type="main">Leakage-resilient symmetric encryption via re-keying</title>
                    <author>
                        <persName><forename type="first">Michel</forename><surname>Abdalla</surname></persName>
                    </author>
                    <author>
                        <persName><forename type="first">Sonia</forename><surname>Belaïd</surname></persName>
                    </author>
                    <author>
                        <persName><forename type="first">Pierre-Alain</forename><surname>Fouque</surname></persName>
                    </author>
                </analytic>
                <monogr>
                    <title level="m">Bertoni and Coron</title>
                    <imprint>
                        <biblScope unit="volume">4</biblScope>
                        <biblScope unit="page" from="471" to="488" />
                    </imprint>
                </monogr>
            </biblStruct>
            <biblStruct xml:id="b1">

        :param soup: parsed TEI document
        :return: bibliography lines and the mapping from "#<xml:id of the item>" to the uid stored in the item's line metadata
        """
        import uuid

        lines = []
        cites = {}  # bib_item_grobid_uid: line_uid

        # according GROBID description
        level_2_paragraph_type = {"a": "title", "j": "title_journal", "s": "title_series", "m": "title_conference_proceedings"}

        bibliography = soup.find("listBibl", recursive=True)
        lines.append(self.__create_line(text="bibliography", hierarchy_level_id=1, paragraph_type="bibliography"))
        if not bibliography:
            return lines, cites

        bib_items = bibliography.find_all("biblStruct")
        if not bib_items:
            return lines, cites

        # parse bibliography items
        for bib_item in bib_items:
            # each item gets its own uid: reusing the uid of the previous line would produce duplicated identifiers
            item_uid = f"bib_{uuid.uuid1()}"
            item_id = bib_item.get("xml:id")
            if item_id is not None:
                cites[f"#{item_id}"] = item_uid
            lines.append(self.__create_line(text="", hierarchy_level_id=2, paragraph_type="bibliography_item", other_fields={"uid": item_uid}))

            # parse bib title
            for title in bib_item.find_all("title", recursive=True):
                level = title.get("level")
                if level:
                    # a level outside the known ones is reported as a plain title instead of failing the whole document
                    paragraph_type = level_2_paragraph_type.get(level, "title")
                    lines.append(self.__create_line(text=self.__tag2text(title), hierarchy_level_id=3, paragraph_type=paragraph_type))

            lines += [  # parse bib authors
                self.__create_line(text=self.__remove_newlines(author).get_text(separator=" "), hierarchy_level_id=3, paragraph_type="author")
                for author in bib_item.find_all("author", recursive=True) if author
            ]

            lines += [  # parse biblScope <biblScope unit="volume">
                self.__create_line(text=self.__tag2text(bibl_scope), hierarchy_level_id=3, paragraph_type="biblScope_volume")
                for bibl_scope in bib_item.find_all("biblScope", {"unit": "volume"}, recursive=True) if bibl_scope
            ]

            # parse <biblScope unit="page"> values
            for bibl_scope in bib_item.find_all("biblScope", {"unit": "page"}, recursive=True):
                page_from, page_to = bibl_scope.get("from"), bibl_scope.get("to")
                if page_from is not None and page_to is not None:
                    page_text = f"{page_from}-{page_to}"
                else:
                    # the warning is emitted only for a range without both bounds; the available text is used instead
                    self.logger.warning("Grobid parsing warning: <biblScope unit='page' ... /> was non-standard format")
                    page_text = self.__tag2text(bibl_scope) or page_from or page_to or ""
                lines.append(self.__create_line(text=page_text, hierarchy_level_id=3, paragraph_type="biblScope_page"))

            lines += [  # parse DOI (maybe more one)
                self.__create_line(text=self.__tag2text(idno), hierarchy_level_id=3, paragraph_type="DOI")
                for idno in bib_item.find_all("idno", recursive=True) if idno
            ]

            if bib_item.publisher:
                lines.append(self.__create_line(text=self.__tag2text(bib_item.publisher), hierarchy_level_id=3, paragraph_type="publisher"))

            if bib_item.date:
                lines.append(self.__create_line(text=self.__tag2text(bib_item.date), hierarchy_level_id=3, paragraph_type="date"))

        return lines, cites

    def __parse_title(self, soup: Tag) -> List[LineWithMeta]:
        """
        Return a single line of type "root" (hierarchy level 0) with the text of the first <title> tag of the document.
        """
        title_text = self.__tag2text(soup.title)
        root_line = self.__create_line(text=title_text, hierarchy_level_id=0, paragraph_type="root")
        return [root_line]

    def __remove_newlines(self, tag: Tag) -> Tag:
        """
        Remove all direct children of ``tag`` that are not tags (e.g. newlines between nested tags).
        The tag is modified in place.

        :param tag: tag to clean
        :return: the same tag with only nested tags left as direct children
        """
        # iterate over a snapshot: extract() deletes the node from tag.contents,
        # and removing items from the list being iterated makes the loop skip the following sibling
        for item in list(tag.contents):
            if not isinstance(item, Tag):
                item.extract()
        return tag