Source code for inspirehep.modules.workflows.tasks.refextract

# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

"""Workflow tasks using refextract API."""

from __future__ import absolute_import, division, print_function

from itertools import chain

from inspire_schemas.utils import (
    convert_old_publication_info_to_new,
    split_page_artid,
)
from inspire_utils.helpers import maybe_int
from inspire_utils.logging import getStackTraceLogger
from refextract import (
    extract_journal_reference,
    extract_references_from_file,
    extract_references_from_string,
)

from inspirehep.modules.workflows.utils import (
    ignore_timeout_error,
    timeout_with_config,
)

from inspirehep.utils.references import (
    local_refextract_kbs_path,
    map_refextract_to_schema,
)
from ..utils import with_debug_logging

LOGGER = getStackTraceLogger(__name__)


@with_debug_logging
[docs]def extract_journal_info(obj, eng): """Extract the journal information from ``pubinfo_freetext``. Runs ``extract_journal_reference`` on the ``pubinfo_freetext`` key of each ``publication_info``, if it exists, and uses the extracted information to populate the other keys. Args: obj: a workflow object. eng: a workflow engine. Returns: None """ if not obj.data.get('publication_info'): return for publication_info in obj.data['publication_info']: try: with local_refextract_kbs_path() as kbs_path: extracted_publication_info = extract_journal_reference( publication_info['pubinfo_freetext'], override_kbs_files=kbs_path, ) if not extracted_publication_info: continue if extracted_publication_info.get('title'): publication_info['journal_title'] = extracted_publication_info['title'] if extracted_publication_info.get('volume'): publication_info['journal_volume'] = extracted_publication_info['volume'] if extracted_publication_info.get('page'): page_start, page_end, artid = split_page_artid(extracted_publication_info['page']) if page_start: publication_info['page_start'] = page_start if page_end: publication_info['page_end'] = page_end if artid: publication_info['artid'] = artid if extracted_publication_info.get('year'): year = maybe_int(extracted_publication_info['year']) if year: publication_info['year'] = year except KeyError: pass obj.data['publication_info'] = convert_old_publication_info_to_new(obj.data['publication_info'])
@ignore_timeout_error(return_value=[]) @timeout_with_config('WORKFLOWS_REFEXTRACT_TIMEOUT')
[docs]def extract_references_from_pdf(filepath, source=None, custom_kbs_file=None): """Extract references from PDF and return in INSPIRE format.""" with local_refextract_kbs_path() as kbs_path: extracted_references = extract_references_from_file( filepath, override_kbs_files=kbs_path, reference_format=u'{title},{volume},{page}', ) return map_refextract_to_schema(extracted_references, source=source)
@ignore_timeout_error(return_value=[]) @timeout_with_config('WORKFLOWS_REFEXTRACT_TIMEOUT')
[docs]def extract_references_from_text(text, source=None, custom_kbs_file=None): """Extract references from text and return in INSPIRE format.""" with local_refextract_kbs_path() as kbs_path: extracted_references = extract_references_from_string( text, override_kbs_files=kbs_path, reference_format=u'{title},{volume},{page}', ) return map_refextract_to_schema(extracted_references, source=source)
@ignore_timeout_error(return_value=[]) @timeout_with_config('WORKFLOWS_REFEXTRACT_TIMEOUT')
[docs]def extract_references_from_raw_refs(references, custom_kbs_file=None): """Extract references from raw references in reference list. Args: references(List[dict]): a schema-compliant ``references`` field. If an element already contains a structured reference (that is, a ``reference`` key), it is not modified. Otherwise, the contents of the ``raw_refs`` is extracted by ``refextract``. custom_kbs_file(dict): configuration for refextract knowledge bases. Returns: List[dict]: a schema-compliant ``references`` field, with all previously unextracted references extracted. """ return list(chain.from_iterable( extract_references_from_raw_ref(ref, custom_kbs_file=custom_kbs_file) for ref in references ))
[docs]def extract_references_from_raw_ref(reference, custom_kbs_file=None): """Extract references from raw references in reference element. Args: reference(dict): a schema-compliant element of the ``references`` field. If it already contains a structured reference (that is, a ``reference`` key), no further processing is done. Otherwise, the contents of the ``raw_refs`` is extracted by ``refextract``. custom_kbs_file(dict): configuration for refextract knowledge bases. Returns: List[dict]: a list of schema-compliant elements of the ``references`` field, with all previously unextracted references extracted. Note: This function returns a list of references because one raw reference might correspond to several references. """ if 'reference' in reference or 'raw_refs' not in reference: return [reference] text_raw_refs = [ref for ref in reference['raw_refs'] if ref['schema'] == 'text'] nontext_schemas = [ref['schema'] for ref in reference['raw_refs'] if ref['schema'] != 'text'] if nontext_schemas: LOGGER.error('Impossible to extract references from non-text raw_refs with schemas %s', nontext_schemas) return [reference] if len(text_raw_refs) > 1: LOGGER.error( 'More than one text raw reference in %s, taking first one, the others will be lost', text_raw_refs ) raw_ref = text_raw_refs[0] return extract_references_from_text( raw_ref['value'], source=raw_ref['source'], custom_kbs_file=custom_kbs_file )