Source code for inspirehep.modules.records.utils

# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

""" Record related utils."""

from __future__ import absolute_import, division, print_function

from itertools import chain
from unicodedata import normalize
import re
import six

from inspire_dojson.utils import get_recid_from_ref
from inspire_utils.date import earliest_date
from inspire_utils.name import generate_name_variations, ParsedName
from inspire_utils.record import get_value
from inspire_utils.helpers import force_list

from inspirehep.modules.pidstore.utils import (
    get_endpoint_from_pid_type,
    get_pid_type_from_schema
)
from inspirehep.modules.records.errors import MissingInspireRecordError
from inspirehep.utils.record_getter import get_db_records
from inspire_utils.record import get_values_for_schema


def is_author(record):
    """Tell whether ``record`` follows the authors schema.

    Args:
        record (dict): the record to inspect.

    Returns:
        bool: ``True`` when the ``$schema`` value of the record contains
        ``authors.json``; ``False`` otherwise, including when the record
        has no ``$schema``.
    """
    # Without a fallback, a missing ``$schema`` yields ``None`` and the
    # ``in`` test below raises a TypeError.
    schema = record.get('$schema') or ''
    return 'authors.json' in schema


def is_hep(record):
    """Tell whether ``record`` follows the hep schema.

    Args:
        record (dict): the record to inspect.

    Returns:
        bool: ``True`` when the ``$schema`` value of the record contains
        ``hep.json``; ``False`` otherwise, including when the record has
        no ``$schema``.
    """
    # Without a fallback, a missing ``$schema`` yields ``None`` and the
    # ``in`` test below raises a TypeError.
    schema = record.get('$schema') or ''
    return 'hep.json' in schema


def is_data(record):
    """Tell whether ``record`` follows the data schema.

    Args:
        record (dict): the record to inspect.

    Returns:
        bool: ``True`` when the ``$schema`` value of the record contains
        ``data.json``; ``False`` otherwise, including when the record has
        no ``$schema``.
    """
    # Without a fallback, a missing ``$schema`` yields ``None`` and the
    # ``in`` test below raises a TypeError.
    schema = record.get('$schema') or ''
    return 'data.json' in schema


def is_institution(record):
    """Tell whether ``record`` follows the institutions schema.

    Args:
        record (dict): the record to inspect.

    Returns:
        bool: ``True`` when the ``$schema`` value of the record contains
        ``institutions.json``; ``False`` otherwise, including when the
        record has no ``$schema``.
    """
    # Without a fallback, a missing ``$schema`` yields ``None`` and the
    # ``in`` test below raises a TypeError.
    schema = record.get('$schema') or ''
    return 'institutions.json' in schema


def is_experiment(record):
    """Tell whether ``record`` follows the experiments schema.

    Args:
        record (dict): the record to inspect.

    Returns:
        bool: ``True`` when the ``$schema`` value of the record contains
        ``experiments.json``; ``False`` otherwise, including when the
        record has no ``$schema``.
    """
    # Without a fallback, a missing ``$schema`` yields ``None`` and the
    # ``in`` test below raises a TypeError.
    schema = record.get('$schema') or ''
    return 'experiments.json' in schema


def is_journal(record):
    """Tell whether ``record`` follows the journals schema.

    Args:
        record (dict): the record to inspect.

    Returns:
        bool: ``True`` when the ``$schema`` value of the record contains
        ``journals.json``; ``False`` otherwise, including when the record
        has no ``$schema``.
    """
    # Without a fallback, a missing ``$schema`` yields ``None`` and the
    # ``in`` test below raises a TypeError.
    schema = record.get('$schema') or ''
    return 'journals.json' in schema


def is_book(record):
    """Tell whether ``book`` is listed among the record's document types.

    Args:
        record (dict): the record to inspect.

    Returns:
        bool: ``True`` when ``'book'`` is in the ``document_type`` value of
        the record, ``False`` otherwise (a missing key counts as empty).
    """
    document_types = record.get('document_type', [])
    return 'book' in document_types


def get_endpoint_from_record(record):
    """Look up the endpoint matching the schema of ``record``.

    The ``$schema`` value of the record is first translated to a PID type,
    which is in turn translated to an endpoint.

    Args:
        record (dict): a record carrying a ``$schema`` key.

    Returns:
        the value returned by ``get_endpoint_from_pid_type`` for the PID
        type of the record.
    """
    return get_endpoint_from_pid_type(
        get_pid_type_from_schema(record['$schema'])
    )


def get_pid_from_record_uri(record_uri):
    """Split a record URI into its ``(pid_type, pid_value)`` pair.

    The PID value is the last non-empty ``/``-separated segment of the URI,
    and the PID type is made of the first three characters of the segment
    right before it.

    Args:
        record_uri (string): the URI of a record.

    Returns:
        tuple: the ``(pid_type, pid_value)`` pair, or ``None`` when the URI
        has fewer than two non-empty segments.
    """
    segments = list(filter(None, record_uri.split('/')))
    if len(segments) < 2:
        return None

    endpoint_segment, pid_value = segments[-2:]
    return endpoint_segment[:3], pid_value


def get_author_display_name(name):
    """Return ``name`` formatted as first names followed by last names.

    Args:
        name (string): an author name, parsed with ``ParsedName.loads``.

    Returns:
        string: the first and last name parts joined by single spaces.
    """
    parsed = ParsedName.loads(name)
    name_parts = parsed.first_list + parsed.last_list
    return " ".join(name_parts)


def get_linked_records_in_field(record, field_path):
    """Get all linked records in a given field.

    Args:
        record (dict): the record containing the links
        field_path (string): a dotted field path specification understandable
            by ``get_value``, containing a json reference to another record.

    Returns:
        Iterator[dict]: an iterator on the linked record.

    Warning:
        Currently, the order in which the linked records are yielded is
        different from the order in which they appear in the record.

    Example:
        >>> record = {'references': [
        ...     {'record': {'$ref': 'https://labs.inspirehep.net/api/literature/1234'}},
        ...     {'record': {'$ref': 'https://labs.inspirehep.net/api/data/421'}},
        ... ]}
        >>> get_linked_records_in_field(record, 'references.record')
        [...]
    """
    full_path = '.'.join([field_path, '$ref'])
    # When the path does not cross a list, ``get_value`` returns a bare URI
    # string; wrap it so that the loop below walks over URIs rather than
    # over the characters of a single URI.
    record_uris = force_list(get_value(record, full_path, []))
    pids = [get_pid_from_record_uri(uri) for uri in record_uris]
    # ``get_pid_from_record_uri`` answers ``None`` for URIs it cannot split;
    # those carry no PID to look up, so they are left out of the query.
    return get_db_records([pid for pid in pids if pid is not None])


def populate_earliest_date(record):
    """Set ``earliest_date`` on ``record`` from its date-bearing fields.

    Every value found under the known date paths is converted to a string
    and handed to ``earliest_date``; the field is only written when that
    call returns something truthy.

    Args:
        record (dict): the record to update in place.
    """
    date_paths = (
        'preprint_date',
        'thesis_info.date',
        'thesis_info.defense_date',
        'publication_info.year',
        'legacy_creation_date',
        'imprints.date',
    )

    candidates = []
    for path in date_paths:
        for value in force_list(get_value(record, path)):
            candidates.append(str(value))

    if not candidates:
        return

    earliest = earliest_date(candidates)
    if earliest:
        record['earliest_date'] = earliest


def populate_citations_count(record):
    """Store the citation count of ``record`` under ``citation_count``.

    Args:
        record: a record exposing a ``get_citations_count`` method.

    Raises:
        MissingInspireRecordError: if ``record`` has no
            ``get_citations_count`` attribute.
    """
    if not hasattr(record, 'get_citations_count'):
        raise MissingInspireRecordError("Record is not InspireRecord!")

    # NOTE: per the original author, the session is already in committed
    # state at this point and a new one cannot be opened, hence the count
    # is obtained through the record's own method.
    record['citation_count'] = record.get_citations_count()


def populate_bookautocomplete(record):
    """Populate the ``bookautocomplete`` field of Literature records.

    The ``input`` list gathers, in this order, the truthy values found under
    the imprint dates, imprint publishers, ISBN values, author full names
    and titles of the record.

    Args:
        record (dict): the record to update in place.
    """
    paths = (
        'imprints.date',
        'imprints.publisher',
        'isbns.value',
        'authors.full_name',
        'titles.title',
    )

    suggestions = []
    for path in paths:
        values = force_list(get_value(record, path, default=[]))
        suggestions.extend(value for value in values if value)

    record['bookautocomplete'] = {'input': suggestions}


def populate_inspire_document_type(record):
    """Populate the ``facet_inspire_doc_type`` field of Literature records.

    The facet lists the document types of the record, then its publication
    types, then ``'peer reviewed'`` when ``refereed`` is set and truthy.

    Args:
        record (dict): the record to update in place.
    """
    doc_types = list(record.get('document_type', []))
    doc_types += record.get('publication_type', [])
    if record.get('refereed'):
        doc_types.append('peer reviewed')

    record['facet_inspire_doc_type'] = doc_types


def populate_recid_from_ref(record):
    """Extract recids from all JSON reference fields and add them to ES.

    For every field that has as a value a JSON reference, adds a sibling
    after extracting the record identifier. Siblings are named by removing
    ``record`` occurrences and appending ``_recid`` without doubling or
    prepending underscores to the original name.

    Example::

        {'record': {'$ref': 'http://x/y/2'}}

    is transformed to::

        {
            'recid': 2,
            'record': {'$ref': 'http://x/y/2'},
        }

    For the list fields named in ``list_ref_fields_translations`` (currently
    only ``deleted_records``), adds a new list with the corresponding
    recids under the translated name.

    Example::

        {
            'deleted_records': [
                {'$ref': 'http://x/y/1'},
                {'$ref': 'http://x/y/2'},
            ],
        }

    is transformed to::

        {
            'deleted_recids': [1, 2],
            'deleted_records': [
                {'$ref': 'http://x/y/1'},
                {'$ref': 'http://x/y/2'},
            ],
        }

    Args:
        record (dict): the record to update in place; nested dicts and
            lists are visited recursively.
    """
    list_ref_fields_translations = {
        'deleted_records': 'deleted_recids'
    }

    def _recursive_find_refs(json_root):
        if isinstance(json_root, list):
            items = enumerate(json_root)
        elif isinstance(json_root, dict):
            # Take a snapshot of the items: sibling keys are added to
            # ``json_root`` inside the loop, and on Python 3 ``items()`` is
            # a live view that raises RuntimeError when the dict grows
            # while it is being iterated.
            items = list(json_root.items())
        else:
            items = []

        for key, value in items:
            if (isinstance(json_root, dict) and isinstance(value, dict) and '$ref' in value):
                # Append '_recid' and remove 'record' from the key name.
                key_basename = key.replace('record', '').rstrip('_')
                new_key = '{}_recid'.format(key_basename).lstrip('_')
                json_root[new_key] = get_recid_from_ref(value)
            elif (isinstance(json_root, dict) and isinstance(value, list) and
                  key in list_ref_fields_translations):
                new_list = [get_recid_from_ref(v) for v in value]
                new_key = list_ref_fields_translations[key]
                json_root[new_key] = new_list
            else:
                _recursive_find_refs(value)

    _recursive_find_refs(record)


def populate_abstract_source_suggest(record):
    """Populate the ``abstract_source_suggest`` field in Literature records.

    Each abstract with a truthy ``source`` gains an
    ``abstract_source_suggest`` entry whose ``input`` is that source; other
    abstracts are left untouched.

    Args:
        record (dict): the record whose ``abstracts`` are updated in place.
    """
    for abstract in record.get('abstracts', []):
        source = abstract.get('source')
        if not source:
            continue

        abstract['abstract_source_suggest'] = {'input': source}


def populate_title_suggest(record):
    """Populate the ``title_suggest`` field of Journals records.

    The ``input`` list holds the journal title, the short title and the
    title variants of the record, in that order, with falsy entries dropped.

    Args:
        record (dict): the record to update in place.
    """
    candidates = [
        get_value(record, 'journal_title.title', default=''),
        record.get('short_title', ''),
    ]
    candidates.extend(record.get('title_variants', []))

    record['title_suggest'] = {
        'input': [title for title in candidates if title],
    }


def populate_affiliation_suggest(record):
    """Populate the ``affiliation_suggest`` field of Institution records.

    The ``input`` list gathers, in this order: ICNs, hierarchy acronyms,
    hierarchy names, the legacy ICN, name variants, postal codes and the
    name variants starting with ``UMR`` stripped of that prefix. Falsy
    entries are dropped.

    Args:
        record (dict): the record to update in place.
    """
    icns = record.get('ICN', [])
    acronyms = get_value(record, 'institution_hierarchy.acronym', default=[])
    hierarchy_names = get_value(record, 'institution_hierarchy.name', default=[])
    legacy_icn = record.get('legacy_ICN', '')
    name_variants = force_list(get_value(record, 'name_variants.value', default=[]))
    postal_codes = force_list(get_value(record, 'addresses.postal_code', default=[]))

    # XXX: the curators need this to be able to search with numbers only.
    umr_numbers = []
    for variant in name_variants:
        umr_prefix = re.match(r'UMR\s', variant, re.IGNORECASE)
        if umr_prefix:
            umr_numbers.append(variant.replace(umr_prefix.group(0), ''))

    candidates = chain(
        icns,
        acronyms,
        hierarchy_names,
        [legacy_icn],
        name_variants,
        postal_codes,
        umr_numbers,
    )

    record['affiliation_suggest'] = {
        'input': [candidate for candidate in candidates if candidate],
    }


def populate_experiment_suggest(record):
    """Populate the ``experiment_suggest`` field of Experiment records.

    The ``input`` list gathers the truthy values found under each of the
    experiment paths below, in path order.

    Args:
        record (dict): the record to update in place.
    """
    experiment_paths = (
        'accelerator.value',
        'collaboration.value',
        'experiment.short_name',
        'experiment.value',
        'institutions.value',
        'legacy_name',
        'long_name',
        'name_variants',
    )

    suggestions = []
    for path in experiment_paths:
        values = force_list(get_value(record, path))
        suggestions.extend(value for value in values if value)

    record['experiment_suggest'] = {'input': suggestions}


def populate_name_variations(record):
    """Generate name variations for each signature of a Literature record.

    Every author with a truthy ``full_name`` gains a ``name_variations``
    list and a ``name_suggest`` entry whose ``input`` holds the truthy
    variations; authors without a full name are left untouched.

    Args:
        record (dict): the record whose ``authors`` are updated in place.
    """
    for author in record.get('authors', []):
        full_name = author.get('full_name')
        if not full_name:
            continue

        variations = generate_name_variations(full_name)
        author['name_variations'] = variations
        author['name_suggest'] = {
            'input': [variation for variation in variations if variation],
        }


def populate_number_of_references(record):
    """Populate the ``number_of_references`` field of a record.

    The field is set to the length of the ``references`` list. It is left
    unset when the record has no ``references`` key (or a ``None`` value),
    and set to ``0`` for an empty list.

    Args:
        record (dict): the record to update in place.
    """
    references = record.get('references')

    # Compare against ``None`` rather than truthiness so that an empty
    # reference list is still counted as zero.
    if references is not None:
        record['number_of_references'] = len(references)


def populate_authors_name_variations(record):
    """Generate name variations for an Author record.

    The variations are computed from ``name.value`` and stored under
    ``name_variations``; nothing is written when that value is falsy.

    Args:
        record (dict): the record to update in place.
    """
    author_name = get_value(record, 'name.value')
    if not author_name:
        return

    record['name_variations'] = generate_name_variations(author_name)


def populate_author_count(record):
    """Populate the ``author_count`` field of Literature records.

    Authors whose ``inspire_roles`` include ``supervisor`` are not counted.

    Args:
        record (dict): the record to update in place.
    """
    count = 0
    for author in record.get('authors', []):
        if 'supervisor' in author.get('inspire_roles', []):
            continue
        count += 1

    record['author_count'] = count


def populate_authors_full_name_unicode_normalized(record):
    """Populate ``authors.full_name_unicode_normalized`` of Literature records.

    Each author gains a ``full_name_unicode_normalized`` key holding the
    NFKC-normalized, lower-cased text form of its ``full_name``.

    Args:
        record (dict): the record whose ``authors`` are updated in place.
    """
    for author in record.get('authors', []):
        full_name = author.get('full_name')
        if full_name is None:
            # An author without a full name has nothing to normalize; skip
            # it instead of failing with a KeyError on the whole record.
            continue

        normalized = normalize('NFKC', six.text_type(full_name)).lower()
        author['full_name_unicode_normalized'] = normalized


def get_author_with_record_facet_author_name(author):
    """Build the ``<bai>_<name>`` facet value of an author record.

    The first ``INSPIRE BAI`` identifier of the author is used as prefix,
    falling back to the literal ``BAI`` when there is none. The name part
    is the preferred name when present, and otherwise the display form of
    ``name.value``.

    Args:
        author (dict): an author record.

    Returns:
        unicode: the facet value.
    """
    bais = get_values_for_schema(author.get('ids', []), 'INSPIRE BAI')
    bai = bais[0] if bais else 'BAI'

    display_name = get_value(author, 'name.preferred_name')
    if not display_name:
        display_name = get_author_display_name(author['name']['value'])

    return u'{}_{}'.format(bai, display_name)


def populate_facet_author_name(record):
    """Populate the ``facet_author_name`` field of Literature records.

    Authors linked to an author record come first, each rendered from that
    linked record; authors without a ``record`` key follow, rendered as
    ``BAI_`` plus the display form of their ``full_name``.

    Args:
        record (dict): the record to update in place.
    """
    linked_authors = get_linked_records_in_field(record, 'authors.record')
    unlinked_authors = [
        author for author in record.get('authors', [])
        if 'record' not in author
    ]

    facet_names = [
        get_author_with_record_facet_author_name(author)
        for author in linked_authors
    ]
    facet_names.extend(
        u'BAI_{}'.format(get_author_display_name(author['full_name']))
        for author in unlinked_authors
    )

    record['facet_author_name'] = facet_names


def populate_author_suggest(record, *args, **kwargs):
    """Populate the ``author_suggest`` field of Authors records.

    The ``input`` list gathers every value found under the name paths
    below, in path order. Extra positional and keyword arguments are
    accepted and ignored.

    Args:
        record (dict): the record to update in place.
    """
    author_paths = (
        'name.preferred_name',
        'name.previous_names',
        'name.name_variants',
        'name.native_names',
        'name.value',
    )

    suggestions = []
    for path in author_paths:
        suggestions.extend(force_list(get_value(record, path)))

    record['author_suggest'] = {'input': suggestions}
Source code for inspirehep.modules.records.utils

INSPIRE-HEP

Navigation

Related Topics