# Source code for papers_without_code.processing
#!/usr/bin/env python
import logging
from typing import Any
from .custom_types import AuthorDetails, MinimalPaperDetails
from .search import _get_keywords
###############################################################################
log = logging.getLogger(__name__)
###############################################################################
def _get_title(data: dict[str, Any]) -> str:
return data["teiHeader"]["fileDesc"]["sourceDesc"]["biblStruct"]["analytic"][
"title"
]["#text"]
def _get_authors(data: dict[str, Any]) -> list[AuthorDetails]:
    """
    Pull author details out of parsed GROBID TEI header data.

    Parameters
    ----------
    data: dict[str, Any]
        The GROBID TEI data converted to a dictionary.

    Returns
    -------
    list[AuthorDetails]
        One AuthorDetails per author listed in the header.
    """
    authors_list_data = data["teiHeader"]["fileDesc"]["sourceDesc"]["biblStruct"][
        "analytic"
    ]["author"]
    # XML-to-dict conversion collapses a single repeated element to a plain
    # dict instead of a one-element list; iterating that dict would walk its
    # keys (strings) and fail. Normalize so single-author papers work too.
    if isinstance(authors_list_data, dict):
        authors_list_data = [authors_list_data]
    authors = []
    for author_data in authors_list_data:
        authors.append(
            AuthorDetails(
                name_parts=[
                    author_data["persName"]["forename"]["#text"],
                    author_data["persName"]["surname"],
                ],
                email=author_data["email"],
                affiliation=author_data["affiliation"]["orgName"]["#text"],
            )
        )
    return authors
def _get_abstract(data: dict[str, Any]) -> str:
return data["teiHeader"]["profileDesc"]["abstract"]["div"]["p"]
# [docs]
def parse_grobid_data(
    grobid_data: dict[str, Any],
) -> MinimalPaperDetails:
    """
    Parse GROBID data into a bit more useful form.

    Parameters
    ----------
    grobid_data: Dict[str, Any]
        The data returned from GROBID after processing a PDF.

    Returns
    -------
    MinimalPaperDetails
        The parsed GROBID data.
    """
    # Pull the header fields needed both for the result and for keyword
    # extraction (keywords are derived from title + abstract text).
    paper_title = _get_title(grobid_data)
    paper_abstract = _get_abstract(grobid_data)

    # Assemble the minimal details object
    return MinimalPaperDetails(
        title=paper_title,
        authors=_get_authors(grobid_data),
        abstract=paper_abstract,
        keywords=_get_keywords(f"{paper_title}\n\n{paper_abstract}"),
    )