# Source code for inspirehep.modules.orcid.putcode_getter

# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2018 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
from __future__ import absolute_import, division, print_function

import itertools
import logging
import re

from flask import current_app

from inspire_service_orcid import exceptions as orcid_client_exceptions
from inspire_service_orcid.client import OrcidClient
from inspire_service_orcid import utils as inspire_service_orcid_utils

from inspirehep.modules.orcid.converter import ExternalIdentifier
from inspirehep.modules.records.utils import get_pid_from_record_uri
from . import exceptions, utils


# Pattern for the URL of a work hosted on inspirehep.net (optionally on the
# `labs.` subdomain, under either `/record/` or `/literature/`).
# Group 1 captures the numeric id that follows.
INSPIRE_WORK_URL_REGEX = re.compile(
    r'https?://(?:labs\.)?inspirehep\.net/(?:record|literature)/(\d+)',
    flags=re.IGNORECASE,
)


# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


class OrcidPutcodeGetter(object):
    """Fetch ORCID putcodes, and the Inspire recids they map to, for one ORCID.

    All the remote calls go through an ``OrcidClient`` built from the given
    ORCID and OAuth token.
    """

    def __init__(self, orcid, oauth_token):
        """
        Args:
            orcid: the ORCID whose works are queried.
            oauth_token: the OAuth token passed to ``OrcidClient``.
        """
        self.orcid = orcid
        self.oauth_token = oauth_token
        self.client = OrcidClient(self.oauth_token, self.orcid)
        # Identifies which works in the ORCID account were pushed by this app.
        self.source_client_id_path = current_app.config['ORCID_APP_CREDENTIALS'][
            'consumer_key']

    def get_all_inspire_putcodes_and_recids_iter(self):
        """
        Query ORCID api and get all the Inspire putcodes for the given ORCID.

        Yields:
            (putcode, recid) pairs. Pairs whose recid is already embedded in
            the works summary come first; the remaining putcodes are then
            resolved through the works details.
        """
        summary_response = self._get_all_works_summary()
        # `putcodes_recids` is a list like: [('43326850', 20), ('43255490', None)]
        putcodes_recids = list(summary_response.get_putcodes_and_recids_for_source_iter(
            self.source_client_id_path))
        putcodes_with_recids = [x for x in putcodes_recids if x[1]]
        putcodes_without_recids = [x[0] for x in putcodes_recids if not x[1]]

        for putcode, recid in putcodes_with_recids:
            yield putcode, recid

        # Nothing left to resolve: skip the works details requests altogether.
        if not putcodes_without_recids:
            return

        for putcode, recid in self._get_putcodes_and_recids_iter(putcodes_without_recids):
            yield putcode, recid

    @staticmethod
    def _log_and_validate_response(response, context):
        """
        Log a client response and convert an invalid result into our exception.

        Args:
            response: a response returned by ``self.client``.
            context (str): text passed to the logging helper.

        Raises:
            exceptions.InputDataInvalidException: when ``raise_for_result()``
                raises a ``BaseOrcidClientJsonException``.
        """
        utils.log_service_response(logger, response, context)
        try:
            response.raise_for_result()
        except orcid_client_exceptions.BaseOrcidClientJsonException as exc:
            raise exceptions.InputDataInvalidException(from_exc=exc)

    def _get_all_works_summary(self):
        """
        Query ORCID api and get all the putcodes with their embedded recids
        for the given ORCID.
        An embedded recid is a recid written as external-identifier.

        Returns:
            the validated works summary response.

        Raises:
            exceptions.InputDataInvalidException: when the response is invalid.
        """
        response = self.client.get_all_works_summary()
        self._log_and_validate_response(response, 'in OrcidPutcodeGetter works summary')
        return response

    def _get_putcodes_and_recids_iter(self, putcodes):
        """
        Yield (putcode, recid) for those putcodes whose work url is an Inspire
        url, parsing the recid out of that url.

        Args:
            putcodes (List): putcodes to look up in the works details.
        """
        for putcode, url in self._get_urls_for_putcodes_iter(putcodes):
            # Filter out putcodes that do not belong to Inspire. A missing
            # url is skipped too: passing None to the regex raises TypeError.
            if not url or not INSPIRE_WORK_URL_REGEX.match(url):
                continue

            recid = get_pid_from_record_uri(url)[1]
            if not recid:
                logger.error('OrcidPutcodeGetter: cannot parse recid from url={} for orcid={}'.format(
                    url, self.orcid))
                continue
            yield putcode, recid

    def _get_urls_for_putcodes_iter(self, putcodes):
        """
        Return an iterator over (putcode, url) for the given putcodes.

        Every bulk response is fetched and validated before anything is
        returned, so an invalid response raises at call time.

        Raises:
            exceptions.InputDataInvalidException: when a response is invalid.
        """
        # The call `get_bulk_works_details_iter()` can be expensive for an
        # author with many works (if each work also has many *contributors*).
        # Fi. for an ATLAS author with ~750 works (each of them with many
        # authors), 8 calls would be performed with a total data transfer > 0.5 Gb.
        per_response_iters = []
        for response in self.client.get_bulk_works_details_iter(putcodes):
            # Note: this log can be large. Consider removing it when this part
            # is considered mature.
            self._log_and_validate_response(response, 'in OrcidPutcodeGetter works details')
            per_response_iters.append(response.get_putcodes_and_urls_iter())
        # Always hand back an iterator, even when there was no response.
        return itertools.chain.from_iterable(per_response_iters)

    def get_putcodes_and_recids_by_identifiers_iter(self, identifiers):
        """
        Yield putcode and recid for each work matched by the external identifiers.
        Note: external identifiers of type 'other-id' are skipped.

        A work is yielded once per matching identifier, so a work matching on
        several identifiers appears several times.

        Args:
            identifiers (List[inspirehep.modules.orcid.converter.ExternalIdentifier]):
                list of all external identifiers added after the xml conversion.
        """
        summary_response = self._get_all_works_summary()
        for putcode, ids in summary_response.get_putcodes_and_external_identifiers_iter():
            # ids is a list like:
            # [
            #     {'external-id-relationship': 'SELF',
            #      'external-id-type': 'other-id',
            #      'external-id-url': {'value': 'http://inspireheptest.cern.ch/record/20'},
            #      'external-id-value': '20'
            #     },...
            # ]

            # Resolving the recid may require a works details request, so it
            # is done only for works that match, and at most once per work.
            recid = None
            recid_resolved = False

            for identifier in ids:
                id_type = identifier.get('external-id-type')
                # We are interested only in doi, arxiv, isbns.
                if not id_type or id_type.lower() == 'other-id':
                    continue
                id_value = identifier.get('external-id-value')
                if not id_value:
                    continue

                if ExternalIdentifier(id_type, id_value) not in identifiers:
                    continue

                if not recid_resolved:
                    recid = self._get_recid_for_work(ids, str(putcode))
                    recid_resolved = True
                yield putcode, recid

    def _get_recid_for_work(self, external_identifiers, putcode):
        """
        Get the recid for a work given its external identifiers and putcode.
        The recid might be in the external identifiers or the works details
        might be requested to find it.

        Args:
            external_identifiers (List[Dict]): a list like:
               [
                   {'external-id-relationship': 'SELF',
                    'external-id-type': 'other-id',
                    'external-id-url': {'value': 'http://inspireheptest.cern.ch/record/20'},
                    'external-id-value': '20'
                   },...
               ]
            putcode: putcode of the given work.

        Returns: the Inspire recid matching the work, or None when it cannot
            be found.
        """
        for identifier in external_identifiers:
            id_type = identifier.get('external-id-type')
            if not id_type or id_type.lower() != 'other-id':
                continue

            # NOTE(review): the `or ''` covers a url value that is present but
            # null, which would otherwise reach re.match as None - confirm
            # whether smartget can actually return that.
            id_url = inspire_service_orcid_utils.smartget(
                identifier, 'external-id-url.value', '') or ''
            if not re.match(r'.*inspire.*', id_url, re.I):
                continue

            id_value = identifier.get('external-id-value')
            if not id_value:
                continue

            # recid found.
            return id_value

        # The recid was not found in the external_identifiers.
        # Thus we look it up in the works details.
        putcodes_recid = list(self._get_putcodes_and_recids_iter([putcode]))
        if putcodes_recid:
            return putcodes_recid[0][1]
        return None