Source code for inspirehep.modules.records.checkers

# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2018 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

"""Records checkers."""

from __future__ import absolute_import, division, print_function

from collections import defaultdict

from sqlalchemy import type_coerce
from sqlalchemy.dialects.postgresql import JSONB

from invenio_records.models import RecordMetadata
from inspire_utils.record import get_value


def increase_cited_count(result, identifier, core):
    """Increase the number of times ``identifier`` has been cited.

    ``result`` maps an identifier to a ``(core_count, non_core_count)`` tuple.
    A truthy ``core`` increments the first element of the tuple, anything
    else increments the second one. ``result`` is updated in place and
    nothing is returned.

    An identifier that is not yet present starts from ``(0, 0)``, so a plain
    ``dict`` works as well as a ``defaultdict``.
    """
    try:
        # Indexing (rather than ``.get``) keeps honouring the default factory
        # when ``result`` is a ``defaultdict``.
        core_count, non_core_count = result[identifier]
    except KeyError:
        core_count, non_core_count = 0, 0

    if core:
        core_count += 1
    else:
        non_core_count += 1

    result[identifier] = (core_count, non_core_count)


def calculate_score_of_reference(counted_reference, core_weight=5):
    """Calculate the score associated with a counted reference.

    ``counted_reference`` is an ``(identifier, (core_count, non_core_count))``
    tuple, i.e. one item of the dictionaries built while counting citations.
    The identifier itself is ignored.

    The score is ``core_count * core_weight + non_core_count``; with the
    default ``core_weight`` of 5, citations from core records are given five
    times more importance than citations from non core records.
    """
    _, (core_count, non_core_count) = counted_reference
    return core_count * core_weight + non_core_count


def order_dictionary_into_list(result_dict):
    """Return ``result_dict`` as a list of ``(key, value)`` tuples.

    The items are sorted by the score that ``calculate_score_of_reference``
    assigns to each of them, highest score first. ``result_dict`` itself is
    not modified.
    """
    return sorted(
        result_dict.items(),
        key=calculate_score_of_reference,
        reverse=True,
    )


def add_linked_ids(dois, arxiv_ids, linked_ids):
    """Merge the citation counts of a doi and of its linked arxiv eprint.

    Increase the amount of times a paper with a specific doi has been cited
    by using its corresponding arxiv eprint and viceversa.

    ``dois`` and ``arxiv_ids`` map an identifier to a
    ``(core_count, non_core_count)`` tuple. ``linked_ids`` maps a
    ``(doi, arxiv_id)`` pair to the ``double_count`` tuple: the number of
    times both identifiers appeared together, which is subtracted so that
    those appearances are not counted twice in the final result.

    ``dois`` and ``arxiv_ids`` are updated in place; nothing is returned.
    """
    # ``items()`` works on both Python 2 and 3, unlike ``iteritems()``.
    for (doi, arxiv_id), double_count in linked_ids.items():
        doi_count = dois[doi]
        arxiv_count = arxiv_ids[arxiv_id]

        # Pairs are processed one after another, so an identifier that takes
        # part in several pairs carries its already merged count forward.
        merged_count = (
            doi_count[0] + arxiv_count[0] - double_count[0],
            doi_count[1] + arxiv_count[1] - double_count[1],
        )

        dois[doi] = merged_count
        arxiv_ids[arxiv_id] = merged_count


def get_all_unlinked_references():
    """Yield every reference that has no ``record`` key.

    This is a generator, not a list: the records whose ``_collections``
    contain ``Literature`` are streamed from the database and one dictionary
    is yielded per reference object lacking a ``record`` key. Each dictionary
    has two keys:

    * ``core``: the value of the ``core`` key of the citing record, or
      ``None`` when the record has no such key.
    * ``reference``: the reference object itself.
    """
    query = (
        RecordMetadata.query
        .filter(
            type_coerce(RecordMetadata.json, JSONB)['_collections']
            .contains(['Literature'])
        )
        .with_entities(RecordMetadata.json)
    )

    # Fetch the rows in batches of 1000 instead of loading them all at once.
    for record in query.yield_per(1000):
        core = record.json.get('core')
        for reference in record.json.get('references', []):
            if 'record' not in reference:
                yield {'core': core, 'reference': reference}


def check_unlinked_references():
    """Return two lists with the unlinked references that have a doi or an arxiv id.

    Every reference produced by ``get_all_unlinked_references`` is read and,
    if it has dois or an arxiv id, the citation is counted for each of those
    identifiers, separately for core and non core citing records. Once all
    the data is read, the counts of identifiers that appeared together in a
    reference are merged and each result is ordered from most relevant to
    less relevant.

    Returns:
        tuple: ``(result_doi, result_arxiv)``, two lists of
        ``(identifier, (core_count, non_core_count))`` tuples.
    """
    result_doi = defaultdict(lambda: (0, 0))
    result_arxiv = defaultdict(lambda: (0, 0))
    # Counts per (doi, arxiv_id) pair found in the same reference; needed so
    # that those citations are not counted twice when the counts are merged.
    linked_ids = defaultdict(lambda: (0, 0))

    for reference in get_all_unlinked_references():
        core = reference["core"]
        dois = get_value(reference, 'reference.reference.dois', [])
        arxiv_id = get_value(reference, 'reference.reference.arxiv_eprint')

        for doi in dois:
            increase_cited_count(result_doi, doi, core)
            if arxiv_id:
                increase_cited_count(linked_ids, (doi, arxiv_id), core)

        if arxiv_id:
            increase_cited_count(result_arxiv, arxiv_id, core)

    add_linked_ids(result_doi, result_arxiv, linked_ids)

    return (
        order_dictionary_into_list(result_doi),
        order_dictionary_into_list(result_arxiv),
    )
Source code for inspirehep.modules.records.checkers

INSPIRE-HEP

Navigation

Related Topics