# Source code for inspirehep.modules.authors.utils
# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
"""Helper functions for authors."""
from __future__ import absolute_import, division, print_function
import re
import numpy as np
from beard.utils.strings import asciify
from beard.clustering import block_phonetic
_bai_parentheses_cleaner = \
re.compile(r"(\([^)]*\))|(\[[^\]]*\])|(\{[^\}]*\})", re.UNICODE)
_bai_last_name_separator = re.compile(r"[,;]+", re.UNICODE)
_bai_names_separator = re.compile("[,;.=\-\s]+", re.UNICODE)
_bai_special_char_mapping = {'ß': 'ss', 'ä': 'ae', 'ö': 'oe', 'ü': 'ue'}
_bai_nonletters = re.compile(r"[^\w\s]|\d", re.UNICODE)
_bai_spaces = re.compile(r"\s+", re.UNICODE)
_bai_particles = ["da", "de", "del", "den", "der",
"du", "van", "von", "het", "y"]
split_on_re = re.compile('[\.\s-]')
single_initial_re = re.compile('^\w\.$')
def _nonempty(words):
words = [w.strip() for w in words]
words = [w for w in words if len(w) >= 1]
return words
def bai(name):
    """Compute the BAI (INSPIRE author identifier base) for a full name.

    Strips parenthesized/bracketed content, splits the name into last name
    and initials, transliterates special characters to ASCII, normalizes
    capitalization (keeping particles such as "van" or "de" lowercase when
    not last), and joins everything with dots, e.g. "Smith, John" -> "J.Smith".

    :param name: full name string, typically "Last, First Middle".
    :return: the BAI string, or "" when no usable name parts remain.
    """
    # Remove content in parentheses, brackets, and braces.
    name = _bai_parentheses_cleaner.sub("", name)
    # Get last name and initials: prefer a comma/semicolon split.
    names = _bai_last_name_separator.split(name, maxsplit=1)
    names = _nonempty(names)
    if len(names) == 1:
        # No comma/semicolon found: fall back to the first whitespace run.
        names = _bai_spaces.split(name, maxsplit=1)
        names = _nonempty(names)
    if len(names) == 0:
        return ""
    elif len(names) == 2:
        last_name = names[0]
        # First letter of each word in the given-names part, uppercased.
        initials = [w[0].upper()
                    for w in _bai_names_separator.split(names[1]) if w]
    else:
        # Only a last name is available.
        last_name = names[0]
        initials = []
    # Asciify, mapping German special characters explicitly first so that
    # e.g. "ö" becomes "oe" rather than "o".
    for char, replacement in _bai_special_char_mapping.items():
        last_name = last_name.replace(char, replacement)
        initials = [i.replace(char, replacement) for i in initials]
    last_name = asciify(last_name)
    initials = _nonempty([asciify(i) for i in initials])
    # Capitalize words in last name; particles stay lowercase unless they
    # are the final word, mixed-case words are left untouched.
    words = _bai_names_separator.split(last_name)
    words = _nonempty(words)
    for i, w in enumerate(words):
        if w.lower() in _bai_particles and i < len(words) - 1:
            words[i] = w.lower()
        elif (all([c.isupper() or c == "'" for c in w]) or
              all([c.islower() or c == "'" for c in w])):
            words[i] = w.title()
        else:
            words[i] = w
    bai = "%s %s" % (" ".join(initials), " ".join(words))
    # Keep letters and spaces only.
    bai = _bai_nonletters.sub("", bai)
    bai = bai.strip()
    # Replace all spaces with ".".
    bai = _bai_spaces.sub(".", bai)
    return bai
def phonetic_blocks(full_names, phonetic_algorithm='nysiis'):
    """Create a dictionary of phonetic blocks for a given list of names.

    :param full_names: list of full-name strings.
    :param phonetic_algorithm: phonetic algorithm forwarded to
        ``beard.clustering.block_phonetic`` (default ``'nysiis'``).
    :return: dict mapping each full name to its phonetic block signature.
    """
    # ``block_phonetic`` expects an array of dicts keyed by "author_name".
    full_names_formatted = [
        {"author_name": i} for i in full_names]
    # Create a list of phonetic blocks.
    # NOTE: ``np.object`` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin ``object`` is the equivalent dtype.
    phonetic_blocks = list(
        block_phonetic(
            np.array(full_names_formatted, dtype=object).reshape(-1, 1),
            threshold=0,
            phonetic_algorithm=phonetic_algorithm,
        )
    )
    return dict(zip(full_names, phonetic_blocks))