# Source code for inspirehep.modules.authors.utils
# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
"""Helper functions for authors."""
from __future__ import absolute_import, division, print_function
import re
import numpy as np
from beard.utils.strings import asciify
from beard.clustering import block_phonetic
_bai_parentheses_cleaner = \
re.compile(r"(\([^)]*\))|(\[[^\]]*\])|(\{[^\}]*\})", re.UNICODE)
_bai_last_name_separator = re.compile(r"[,;]+", re.UNICODE)
_bai_names_separator = re.compile("[,;.=\-\s]+", re.UNICODE)
_bai_special_char_mapping = {'ß': 'ss', 'ä': 'ae', 'ö': 'oe', 'ü': 'ue'}
_bai_nonletters = re.compile(r"[^\w\s]|\d", re.UNICODE)
_bai_spaces = re.compile(r"\s+", re.UNICODE)
_bai_particles = ["da", "de", "del", "den", "der",
"du", "van", "von", "het", "y"]
split_on_re = re.compile('[\.\s-]')
single_initial_re = re.compile('^\w\.$')
def _nonempty(words):
words = [w.strip() for w in words]
words = [w for w in words if len(w) >= 1]
return words
def bai(name):
    """Compute the BAI (INSPIRE author identifier base) for a full name.

    Strips parenthesized/bracketed content, splits the name into last name
    and initials, transliterates special characters to ASCII, normalizes
    capitalization (keeping particles such as "van" or "de" lowercase when
    not last), and joins everything with dots, e.g. "Smith, John" -> "J.Smith".

    :param name: full name string, typically "Last, First Middle".
    :return: the BAI string, or "" when no usable name parts remain.
    """
    # Remove content in parentheses, brackets, and braces.
    name = _bai_parentheses_cleaner.sub("", name)
    # Get last name and initials: prefer a comma/semicolon split.
    names = _bai_last_name_separator.split(name, maxsplit=1)
    names = _nonempty(names)
    if len(names) == 1:
        # No comma/semicolon found: fall back to the first whitespace run.
        names = _bai_spaces.split(name, maxsplit=1)
        names = _nonempty(names)
    if len(names) == 0:
        return ""
    elif len(names) == 2:
        last_name = names[0]
        # First letter of each word in the given-names part, uppercased.
        initials = [w[0].upper()
                    for w in _bai_names_separator.split(names[1]) if w]
    else:
        # Only a last name is available.
        last_name = names[0]
        initials = []
    # Asciify, mapping German special characters explicitly first so that
    # e.g. "ö" becomes "oe" rather than "o".
    for char, replacement in _bai_special_char_mapping.items():
        last_name = last_name.replace(char, replacement)
        initials = [i.replace(char, replacement) for i in initials]
    last_name = asciify(last_name)
    initials = _nonempty([asciify(i) for i in initials])
    # Capitalize words in last name; particles stay lowercase unless they
    # are the final word, mixed-case words are left untouched.
    words = _bai_names_separator.split(last_name)
    words = _nonempty(words)
    for i, w in enumerate(words):
        if w.lower() in _bai_particles and i < len(words) - 1:
            words[i] = w.lower()
        elif (all([c.isupper() or c == "'" for c in w]) or
              all([c.islower() or c == "'" for c in w])):
            words[i] = w.title()
        else:
            words[i] = w
    bai = "%s %s" % (" ".join(initials), " ".join(words))
    # Keep letters and spaces only.
    bai = _bai_nonletters.sub("", bai)
    bai = bai.strip()
    # Replace all spaces with ".".
    bai = _bai_spaces.sub(".", bai)
    return bai
def phonetic_blocks(full_names, phonetic_algorithm='nysiis'):
    """Create a dictionary of phonetic blocks for a given list of names.

    :param full_names: list of full-name strings.
    :param phonetic_algorithm: phonetic algorithm forwarded to
        ``beard.clustering.block_phonetic`` (default ``'nysiis'``).
    :return: dict mapping each full name to its phonetic block signature.
    """
    # ``block_phonetic`` expects an array of dicts keyed by "author_name".
    full_names_formatted = [
        {"author_name": i} for i in full_names]
    # Create a list of phonetic blocks.
    # NOTE: ``np.object`` was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin ``object`` is the equivalent dtype.
    phonetic_blocks = list(
        block_phonetic(
            np.array(full_names_formatted, dtype=object).reshape(-1, 1),
            threshold=0,
            phonetic_algorithm=phonetic_algorithm,
        )
    )
    return dict(zip(full_names, phonetic_blocks))