# Source code for papers_without_code.processing
#!/usr/bin/env python
import logging
from typing import Any
from .custom_types import AuthorDetails, MinimalPaperDetails
from .search import _get_keywords
###############################################################################
log = logging.getLogger(__name__)
###############################################################################
def _get_title(data: dict[str, Any]) -> str:
    return data["teiHeader"]["fileDesc"]["sourceDesc"]["biblStruct"]["analytic"][
        "title"
    ]["#text"]
def _get_authors(data: dict[str, Any]) -> list[AuthorDetails]:
    """
    Extract author details from xmltodict-parsed GROBID TEI data.

    Parameters
    ----------
    data: Dict[str, Any]
        The data returned from GROBID after processing a PDF.

    Returns
    -------
    List[AuthorDetails]
        One entry per author found in the TEI header.
    """
    authors_list_data = data["teiHeader"]["fileDesc"]["sourceDesc"]["biblStruct"][
        "analytic"
    ]["author"]

    # xmltodict collapses a single repeated element into a plain dict instead
    # of a one-element list; normalize so single-author papers don't crash.
    if isinstance(authors_list_data, dict):
        authors_list_data = [authors_list_data]

    authors = []
    for author_data in authors_list_data:
        pers_name = author_data["persName"]

        # Multiple <forename> elements (first + middle names) arrive as a
        # list of dicts; join their text parts into one string.
        forename = pers_name["forename"]
        if isinstance(forename, list):
            forename_text = " ".join(part["#text"] for part in forename)
        else:
            forename_text = forename["#text"]

        # email and affiliation are optional in the TEI header — use .get so
        # their absence yields None instead of a KeyError.
        affiliation_data = author_data.get("affiliation")
        affiliation_text = None
        if isinstance(affiliation_data, dict):
            org_name = affiliation_data.get("orgName")
            if isinstance(org_name, dict):
                affiliation_text = org_name.get("#text")

        authors.append(
            AuthorDetails(
                name_parts=[
                    forename_text,
                    pers_name["surname"],
                ],
                email=author_data.get("email"),
                affiliation=affiliation_text,
            )
        )
    return authors
def _get_abstract(data: dict[str, Any]) -> str:
    return data["teiHeader"]["profileDesc"]["abstract"]["div"]["p"]
# [docs] -- Sphinx HTML export marker; not part of the original module
def parse_grobid_data(
    grobid_data: dict[str, Any],
) -> MinimalPaperDetails:
    """
    Parse GROBID data into a bit more useful form.

    Parameters
    ----------
    grobid_data: Dict[str, Any]
        The data returned from GROBID after processing a PDF.

    Returns
    -------
    MinimalPaperDetails
        The parsed GROBID data.
    """
    # Pull the text pieces needed both for the result and for keyword mining
    paper_title = _get_title(grobid_data)
    paper_abstract = _get_abstract(grobid_data)

    # Keywords are mined from the combined title + abstract text
    return MinimalPaperDetails(
        title=paper_title,
        authors=_get_authors(grobid_data),
        abstract=paper_abstract,
        keywords=_get_keywords(f"{paper_title}\n\n{paper_abstract}"),
    )