Source code for inspirehep.modules.refextract.matcher

# -*- coding: utf-8 -*-
# This file is part of INSPIRE.
# Copyright (C) 2018 CERN.
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

from __future__ import absolute_import, division, print_function

from inspire_dojson.utils import get_record_ref, get_recid_from_ref
from inspire_matcher import match
from inspire_utils.dedupers import dedupe_list
from inspire_utils.record import get_value

from inspirehep.modules.refextract import config

def _add_match_to_reference(reference, matched_recid, es_index):
    """Modifies a reference to include its record id."""
    if es_index == 'records-data':
        reference['record'] = get_record_ref(matched_recid, 'data')
    elif es_index == 'records-hep':
        reference['record'] = get_record_ref(matched_recid, 'literature')

def match_reference_with_config(reference, config, previous_matched_recid=None):
    """Match a reference using inspire-matcher given the config.

    ``reference`` is updated in place: when exactly one record matches, or
    when several match and one of them is ``previous_matched_recid``, a
    ``record`` entry is added to it.

    Args:
        reference (dict): the metadata of the reference.
        config (dict): the inspire-matcher configuration for the queries; its
            ``index`` key tells which kind of record a match points to.
        previous_matched_recid (int): the record id of the last matched
            reference from the list of references.

    Returns:
        dict: the matched reference (the same object that was passed in).
    """
    # XXX: avoid this type casting.
    # The year is handed to the matcher as a string and cast back afterwards.
    try:
        publication_info = reference['reference']['publication_info']
        original_year = publication_info['year']
    except KeyError:
        publication_info = None
    else:
        publication_info['year'] = str(original_year)

    try:
        matched_recids = dedupe_list([
            matched_record['_source']['control_number']
            for matched_record in match(reference, config)
        ])
    finally:
        # Undo the cast even if matching fails, so that the caller's
        # reference is never left with a stringified year.
        if publication_info is not None:
            try:
                publication_info['year'] = int(publication_info['year'])
            except KeyError:
                pass
            except (TypeError, ValueError):
                # A year that cannot be cast must not abort the matching:
                # put back exactly what was there before.
                publication_info['year'] = original_year

    if len(matched_recids) == 1:
        _add_match_to_reference(reference, matched_recids[0], config['index'])
    elif previous_matched_recid in matched_recids:
        # Ambiguous match: prefer the record the previous reference matched.
        _add_match_to_reference(reference, previous_matched_recid, config['index'])

    return reference
def match_reference(reference, previous_matched_recid=None):
    """Match a reference using inspire-matcher.

    The matcher configurations are tried in order (unique identifiers,
    publication info, data) and the search stops at the first one after
    which the reference carries a ``record`` entry.

    Args:
        reference (dict): the metadata of a reference.
        previous_matched_recid (int): the record id of the last matched
            reference from the list of references.

    Returns:
        dict: the matched reference.
    """
    # References with a curated relation are returned without matching.
    if reference.get('curated_relation'):
        return reference

    journal_title = get_value(reference, 'reference.publication_info.journal_title')
    if journal_title in ['JCAP', 'JHEP']:
        publication_info_config = config.REFERENCE_MATCHER_JHEP_AND_JCAP_PUBLICATION_INFO_CONFIG
    else:
        publication_info_config = config.REFERENCE_MATCHER_DEFAULT_PUBLICATION_INFO_CONFIG

    matcher_configs = [
        config.REFERENCE_MATCHER_UNIQUE_IDENTIFIERS_CONFIG,
        publication_info_config,
        config.REFERENCE_MATCHER_DATA_CONFIG,
    ]

    for matcher_config in matcher_configs:
        candidate = match_reference_with_config(
            reference, matcher_config, previous_matched_recid)
        if 'record' in candidate:
            return candidate

    return reference
def match_references(references):
    """Match references to their respective records in INSPIRE.

    The references are processed in order; the record id of the most recent
    matched reference is passed along when matching the next one.

    Args:
        references (list): the list of references.

    Returns:
        list: the matched references.
    """
    matched_references = []
    previous_matched_recid = None

    for reference in references:
        matched = match_reference(reference, previous_matched_recid)
        matched_references.append(matched)
        if 'record' in matched:
            previous_matched_recid = get_recid_from_ref(matched['record'])

    return matched_references