Source code for inspirehep.modules.records.checkers

# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2018 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

"""Records checkers."""

from __future__ import absolute_import, division, print_function

from collections import defaultdict

from sqlalchemy import type_coerce
from sqlalchemy.dialects.postgresql import JSONB

from invenio_records.models import RecordMetadata
from inspire_utils.record import get_value


[docs]def increase_cited_count(result, identifier, core): """Increases the number of times a reference with the same identifier has appeared""" if core: result[identifier] = (result[identifier][0] + 1, result[identifier][1]) else: result[identifier] = (result[identifier][0], result[identifier][1] + 1)
[docs]def calculate_score_of_reference(counted_reference): """Given a tuple of the number of times cited by a core record and a non core record, calculate a score associated with a reference. The score is calculated giving five times more importance to core records""" _, (core_count, non_core_count) = counted_reference return core_count * 5 + non_core_count
[docs]def order_dictionary_into_list(result_dict): """Return ``result_dict`` as an ordered list of tuples""" sorted_list = sorted(result_dict.items(), key=calculate_score_of_reference, reverse=True) return sorted_list
[docs]def add_linked_ids(dois, arxiv_ids, linked_ids): """Increase the amount of times a paper with a specific doi has been cited by using its corresponding arxiv eprint and viceversa ``double_count`` is used to count the times that a doi and an arxiv eprint appear in the same paper so that we don't count them twice in the final result""" for (doi, arxiv_id), double_count in linked_ids.iteritems(): total_count_core = dois[doi][0] + arxiv_ids[arxiv_id][0] - double_count[0] total_count_non_core = dois[doi][1] + arxiv_ids[arxiv_id][1] - double_count[1] dois[doi] = (total_count_core, total_count_non_core) arxiv_ids[arxiv_id] = (total_count_core, total_count_non_core)
[docs]def get_all_unlinked_references(): """Return a list of dict, in which each dictionary corresponds to one reference object and the status of core or non core""" query = ( RecordMetadata.query .filter( type_coerce(RecordMetadata.json, JSONB)['_collections'] .contains(['Literature']) ) .with_entities(RecordMetadata.json) ) for record in query.yield_per(1000): core = record.json.get('core') for reference in record.json.get('references', []): if 'record' not in reference: yield {'core': core, 'reference': reference}
[docs]def check_unlinked_references(): """Return two lists with the unlinked references that have a doi or an arxiv id. If the reference read has a doi or an arxiv id, it is stored in the data structure. Once all the data is read, it is ordered by most relevant to less relevant.""" result_doi, result_arxiv = defaultdict(lambda: (0, 0)), defaultdict(lambda: (0, 0)) linked_ids = defaultdict(lambda: (0, 0)) data = get_all_unlinked_references() for reference in data: dois = get_value(reference, 'reference.reference.dois', []) arxiv_id = get_value(reference, 'reference.reference.arxiv_eprint') if arxiv_id and len(dois) > 0: for doi in dois: increase_cited_count(linked_ids, (doi, arxiv_id), reference["core"]) for doi in dois: increase_cited_count(result_doi, doi, reference["core"]) if arxiv_id: increase_cited_count(result_arxiv, arxiv_id, reference["core"]) add_linked_ids(result_doi, result_arxiv, linked_ids) result_doi = order_dictionary_into_list(result_doi) result_arxiv = order_dictionary_into_list(result_arxiv) return result_doi, result_arxiv