# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
""" Record related utils."""
from __future__ import absolute_import, division, print_function
from itertools import chain
from unicodedata import normalize
import re
import six
from inspire_dojson.utils import get_recid_from_ref
from inspire_utils.date import earliest_date
from inspire_utils.name import generate_name_variations, ParsedName
from inspire_utils.record import get_value
from inspire_utils.helpers import force_list
from inspirehep.modules.pidstore.utils import (
get_endpoint_from_pid_type,
get_pid_type_from_schema
)
from inspirehep.modules.records.errors import MissingInspireRecordError
from inspirehep.utils.record_getter import get_db_records
from inspire_utils.record import get_values_for_schema
[docs]def is_author(record):
return 'authors.json' in record.get('$schema')
[docs]def is_hep(record):
return 'hep.json' in record.get('$schema')
[docs]def is_data(record):
return 'data.json' in record.get('$schema')
[docs]def is_institution(record):
return 'institutions.json' in record.get('$schema')
[docs]def is_experiment(record):
return 'experiments.json' in record.get('$schema')
[docs]def is_journal(record):
return 'journals.json' in record.get('$schema')
[docs]def is_book(record):
return 'book' in record.get('document_type', [])
[docs]def get_endpoint_from_record(record):
"""Return the endpoint corresponding to a record."""
pid_type = get_pid_type_from_schema(record['$schema'])
endpoint = get_endpoint_from_pid_type(pid_type)
return endpoint
[docs]def get_pid_from_record_uri(record_uri):
"""Transform a URI to a record into a (pid_type, pid_value) pair."""
parts = [part for part in record_uri.split('/') if part]
try:
pid_type = parts[-2][:3]
pid_value = parts[-1]
except IndexError:
return None
return pid_type, pid_value
[docs]def get_author_display_name(name):
"""Returns the display name in format Firstnames Lastnames"""
parsed_name = ParsedName.loads(name)
return " ".join(parsed_name.first_list + parsed_name.last_list)
[docs]def get_linked_records_in_field(record, field_path):
"""Get all linked records in a given field.
Args:
record (dict): the record containing the links
field_path (string): a dotted field path specification understandable
by ``get_value``, containing a json reference to another record.
Returns:
Iterator[dict]: an iterator on the linked record.
Warning:
Currently, the order in which the linked records are yielded is
different from the order in which they appear in the record.
Example:
>>> record = {'references': [
... {'record': {'$ref': 'https://labs.inspirehep.net/api/literature/1234'}},
... {'record': {'$ref': 'https://labs.inspirehep.net/api/data/421'}},
... ]}
>>> get_linked_record_in_field(record, 'references.record')
[...]
"""
full_path = '.'.join([field_path, '$ref'])
pids = force_list([get_pid_from_record_uri(rec) for rec in get_value(record, full_path, [])])
return get_db_records(pids)
[docs]def populate_earliest_date(record):
"""Populate the ``earliest_date`` field of Literature records."""
date_paths = [
'preprint_date',
'thesis_info.date',
'thesis_info.defense_date',
'publication_info.year',
'legacy_creation_date',
'imprints.date',
]
dates = [
str(el) for el in chain.from_iterable(
[force_list(get_value(record, path)) for path in date_paths]
)
]
if dates:
result = earliest_date(dates)
if result:
record['earliest_date'] = result
[docs]def populate_citations_count(record):
"""Populate citations_count in ES from"""
if hasattr(record, 'get_citations_count'):
# Make sure that record has method get_citations_count
# Session is in commited state here, and I cannot open new one...
citation_count = record.get_citations_count()
record['citation_count'] = citation_count
else:
raise MissingInspireRecordError("Record is not InspireRecord!")
[docs]def populate_bookautocomplete(record):
"""Populate the ```bookautocomplete`` field of Literature records."""
paths = [
'imprints.date',
'imprints.publisher',
'isbns.value',
]
authors = force_list(get_value(record, 'authors.full_name', default=[]))
titles = force_list(get_value(record, 'titles.title', default=[]))
input_values = list(chain.from_iterable(
force_list(get_value(record, path, default=[])) for path in paths))
input_values.extend(authors)
input_values.extend(titles)
input_values = [el for el in input_values if el]
record['bookautocomplete'] = {
'input': input_values,
}
[docs]def populate_inspire_document_type(record):
"""Populate the ``facet_inspire_doc_type`` field of Literature records."""
result = []
result.extend(record.get('document_type', []))
result.extend(record.get('publication_type', []))
if 'refereed' in record and record['refereed']:
result.append('peer reviewed')
record['facet_inspire_doc_type'] = result
[docs]def populate_recid_from_ref(record):
"""Extract recids from all JSON reference fields and add them to ES.
For every field that has as a value a JSON reference, adds a sibling
after extracting the record identifier. Siblings are named by removing
``record`` occurrences and appending ``_recid`` without doubling or
prepending underscores to the original name.
Example::
{'record': {'$ref': 'http://x/y/2}}
is transformed to::
{
'recid': 2,
'record': {'$ref': 'http://x/y/2},
}
For every list of object references adds a new list with the
corresponding recids, whose name is similarly computed.
Example::
{
'records': [
{'$ref': 'http://x/y/1'},
{'$ref': 'http://x/y/2'},
],
}
is transformed to::
{
'recids': [1, 2],
'records': [
{'$ref': 'http://x/y/1'},
{'$ref': 'http://x/y/2'},
],
}
"""
list_ref_fields_translations = {
'deleted_records': 'deleted_recids'
}
def _recursive_find_refs(json_root):
if isinstance(json_root, list):
items = enumerate(json_root)
elif isinstance(json_root, dict):
# Note that items have to be generated before altering the dict.
# In this case, iteritems might break during iteration.
items = json_root.items()
else:
items = []
for key, value in items:
if (isinstance(json_root, dict) and isinstance(value, dict) and '$ref' in value):
# Append '_recid' and remove 'record' from the key name.
key_basename = key.replace('record', '').rstrip('_')
new_key = '{}_recid'.format(key_basename).lstrip('_')
json_root[new_key] = get_recid_from_ref(value)
elif (isinstance(json_root, dict) and isinstance(value, list) and
key in list_ref_fields_translations):
new_list = [get_recid_from_ref(v) for v in value]
new_key = list_ref_fields_translations[key]
json_root[new_key] = new_list
else:
_recursive_find_refs(value)
_recursive_find_refs(record)
[docs]def populate_abstract_source_suggest(record):
"""Populate the ``abstract_source_suggest`` field in Literature records."""
abstracts = record.get('abstracts', [])
for abstract in abstracts:
source = abstract.get('source')
if source:
abstract.update({
'abstract_source_suggest': {
'input': source,
},
})
[docs]def populate_title_suggest(record):
"""Populate the ``title_suggest`` field of Journals records."""
journal_title = get_value(record, 'journal_title.title', default='')
short_title = record.get('short_title', '')
title_variants = record.get('title_variants', [])
input_values = []
input_values.append(journal_title)
input_values.append(short_title)
input_values.extend(title_variants)
input_values = [el for el in input_values if el]
record['title_suggest'] = {
'input': input_values,
}
[docs]def populate_affiliation_suggest(record):
"""Populate the ``affiliation_suggest`` field of Institution records."""
ICN = record.get('ICN', [])
institution_acronyms = get_value(record, 'institution_hierarchy.acronym', default=[])
institution_names = get_value(record, 'institution_hierarchy.name', default=[])
legacy_ICN = record.get('legacy_ICN', '')
name_variants = force_list(get_value(record, 'name_variants.value', default=[]))
postal_codes = force_list(get_value(record, 'addresses.postal_code', default=[]))
# XXX: this is need by the curators to search only with numbers
extract_numbers_from_umr = []
for name in name_variants:
match = re.match(r'UMR\s', name, re.IGNORECASE)
if match:
umr_number = name.replace(match.group(0), '')
extract_numbers_from_umr.append(umr_number)
input_values = []
input_values.extend(ICN)
input_values.extend(institution_acronyms)
input_values.extend(institution_names)
input_values.append(legacy_ICN)
input_values.extend(name_variants)
input_values.extend(postal_codes)
input_values.extend(extract_numbers_from_umr)
input_values = [el for el in input_values if el]
record['affiliation_suggest'] = {
'input': input_values,
}
[docs]def populate_experiment_suggest(record):
"""Populates experiment_suggest field of experiment records."""
experiment_paths = [
'accelerator.value',
'collaboration.value',
'experiment.short_name',
'experiment.value',
'institutions.value',
'legacy_name',
'long_name',
'name_variants',
]
input_values = [el for el in chain.from_iterable(
[force_list(get_value(record, path)) for path in experiment_paths]) if el]
record['experiment_suggest'] = {
'input': input_values,
}
[docs]def populate_name_variations(record):
"""Generate name variations for each signature of a Literature record."""
authors = record.get('authors', [])
for author in authors:
full_name = author.get('full_name')
if full_name:
name_variations = generate_name_variations(full_name)
author.update({'name_variations': name_variations})
author.update({'name_suggest': {
'input': [variation for variation in name_variations if variation],
}})
[docs]def populate_number_of_references(record):
"""Generate name variations for each signature of a Literature record."""
references = record.get('references')
if references is not None:
record['number_of_references'] = len(references)
[docs]def populate_authors_name_variations(record):
"""Generate name variations for an Author record."""
author_name = get_value(record, 'name.value')
if author_name:
name_variations = generate_name_variations(author_name)
record['name_variations'] = name_variations
[docs]def populate_author_count(record):
"""Populate the ``author_count`` field of Literature records."""
authors = record.get('authors', [])
authors_excluding_supervisors = [
author for author in authors
if 'supervisor' not in author.get('inspire_roles', [])
]
record['author_count'] = len(authors_excluding_supervisors)
[docs]def populate_authors_full_name_unicode_normalized(record):
"""Populate the ``authors.full_name_normalized`` field of Literature records."""
authors = record.get('authors', [])
for index, author in enumerate(authors):
full_name = six.text_type(author['full_name'])
record['authors'][index].update({
'full_name_unicode_normalized': normalize('NFKC', full_name).lower()
})
[docs]def get_author_with_record_facet_author_name(author):
author_ids = author.get('ids', [])
author_bai = get_values_for_schema(author_ids, 'INSPIRE BAI')
bai = author_bai[0] if author_bai else 'BAI'
author_preferred_name = get_value(author, 'name.preferred_name')
if author_preferred_name:
return u'{}_{}'.format(bai, author_preferred_name)
else:
return u'{}_{}'.format(bai, get_author_display_name(author['name']['value']))
[docs]def populate_facet_author_name(record):
"""Populate the ``facet_author_name`` field of Literature records."""
authors_with_record = get_linked_records_in_field(record, 'authors.record')
authors_without_record = [author for author in record.get('authors', []) if 'record' not in author]
result = []
for author in authors_with_record:
result.append(get_author_with_record_facet_author_name(author))
for author in authors_without_record:
result.append(u'BAI_{}'.format(get_author_display_name(author['full_name'])))
record['facet_author_name'] = result
[docs]def populate_author_suggest(record, *args, **kwargs):
"""Populate the ``author_suggest`` field of Authors records."""
author_paths = [
'name.preferred_name',
'name.previous_names',
'name.name_variants',
'name.native_names',
'name.value',
]
input_values = [el for el in chain.from_iterable([force_list(get_value(record, path)) for path in author_paths])]
record['author_suggest'] = {
'input': input_values
}