Source code for inspirehep.modules.tools.authorlist

# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.


"""Functions to parse an authorlist."""

from __future__ import absolute_import, division, print_function

import re
import six

# Separator between text blocks: two newlines with nothing but whitespace
# in between. create_authors() splits its input on this.
re_emptyline = re.compile(r'\n\s*\n', re.UNICODE)
# Hyphen / minus / dash look-alikes plus the literal four-character text
# "\255"; create_authors() replaces every match with a plain '-'.
# Written as a unicode (non-raw) literal so that the interpreter itself
# resolves the \uXXXX escapes: the regex engine only understands them from
# Python 3.3 on, so a raw byte-string pattern cannot be relied on to match
# these characters under Python 2.
re_hyphens = re.compile(
    u'(\\\\255|\u02D7|\u0335|\u0336|\u2212|\u2013|\u002D|\uFE63|\uFF0D)',
    re.UNICODE)
# Two or more consecutive whitespace characters; parse_authors() collapses
# each run into a single space.
re_multiple_space = re.compile(r'\s{2,}', re.UNICODE)
# A whole word made only of digits and/or characters that are neither word
# characters nor one of . ' - ; parse_authors() treats such a word as the end
# of the current author name and tries to resolve it as affiliation id(s).
re_potential_key = re.compile(r"^(?:\d|[^\w.'-])+$", re.UNICODE)
# A run of digits / non-word characters (excluding , . ' -) that is followed
# by a space; parse_authors() inserts a space in front of it so that an id
# glued to a name becomes a word of its own.
re_trailing_nonword = re.compile(r"((?:\d|[^\w,.'-])+ )", re.UNICODE)
# Any single character that is neither a word character nor a space;
# split_id() uses it to pull individual symbols out of a word.
re_symbols = re.compile(r'[^\w ]', re.UNICODE)


def split_id(word):
    """Break a word into candidate affiliation ids.

    Every character matched by ``re_symbols`` (neither a word character nor
    a space) becomes an id of its own; the non-empty pieces left between
    those symbols follow afterwards. A word without any such symbol is
    returned unchanged as the only element.

    E.g.: '*12%$' -> ['*', '%', '$', '12']

    Args:
        word: the string to split.

    Returns:
        list: the candidate ids, symbols first, then the remaining pieces.
    """
    found_symbols = re_symbols.findall(word)
    if not found_symbols:
        return [word]
    # Whatever sits between the symbols (e.g. digits or letters) may be an
    # id as well; empty pieces from adjacent symbols are dropped.
    leftovers = [piece for piece in re_symbols.split(word) if piece]
    return found_symbols + leftovers
def parse_authors(text, affiliations):
    """Split a block of text into authors and attach their affiliations.

    Authors can be separated by ',', ' and ', a newline or an affiliation
    id. Names are returned as written, with their words joined by single
    spaces.

    Args:
        text: the block of text holding the author names (and aff-ids).
        affiliations: dict mapping affiliation id to affiliation string;
            may be empty.

    Returns:
        tuple: ``(authors, warnings)`` where ``authors`` is a list of
        ``(author_fullname, [author_affiliations])`` tuples and
        ``warnings`` is a list of strings.
    """
    authors = []
    warnings = []

    # Turn every separator into a stand-alone ',' word and make sure the
    # text ends with a space, so the loop below always sees a final empty
    # word that flushes the last author followed by aff-ids.
    text = text.replace(',', ' , ')
    text = text.replace('\n', ' , ')
    text = text.replace(' and ', ' , ') + ' '
    # Detach ids that are glued to the end of a name ("Smith1 " -> "Smith 1 ")
    text = re_trailing_nonword.sub(r' \1', text)
    text = re_multiple_space.sub(' ', text)

    # Real lists are needed here: on Python 3 ``dict.keys()`` is a view that
    # supports neither indexing nor ``remove()``.
    aff_keys = list(affiliations)
    unused_aff_keys = list(aff_keys)

    # The first key decides what kind of ids are in use.
    key_type = ''
    if aff_keys:
        if aff_keys[0].isalpha():
            key_type = 'alpha'
        elif aff_keys[0].isdigit():
            key_type = 'digit'
        else:
            key_type = 'symbol'
            warnings.append('CAUTION! Using symbols (# and stuff) as aff-IDs.')
    else:
        warnings.append('Found no affiliations (empty line needed)')

    author_names = []   # words of the name currently being collected
    author_affs = []    # affiliations collected for the pending author
    fullname = ''       # completed name still waiting for its affiliations
    list_of_words = text.split(' ')
    for nw, word in enumerate(list_of_words):
        if word in affiliations or word == ',' or re_potential_key.search(word):
            # author name stops here, add affiliations
            if author_names:
                fullname = ' '.join(author_names)
                if len(author_names) == 1:
                    warnings.append('Author without firstname: %s' % fullname)
                author_names = []
            if word in affiliations:
                author_affs.append(affiliations[word])
                if word in unused_aff_keys:
                    unused_aff_keys.remove(word)
            elif not word == ',':
                # something left over or not separated
                for aff_key in split_id(word):
                    if aff_key in affiliations:
                        author_affs.append(affiliations[aff_key])
                        if aff_key in unused_aff_keys:
                            unused_aff_keys.remove(aff_key)
                    else:
                        warnings.append(
                            'Unresolved aff-ID or stray footnote symbol. '
                            'Problematic author and aff-id: %s %s'
                            % (fullname, aff_key)
                        )
        else:
            # (part of) (next) author name, process previous author
            if key_type == 'alpha' and word.islower() and word.isalpha():
                # A lower-case word could just as well be an aff-id that is
                # missing from the affiliation list: show it in context.
                three_words = \
                    list_of_words[max(nw - 2, 0):min(nw + 3, len(list_of_words)) - 1]
                warnings.append(
                    'Is this part of a name or missing aff-id? "%s" in %s'
                    % (word, ' '.join(three_words))
                )
            if fullname:
                if affiliations and not author_affs:
                    # there should be affiliations
                    warnings.append(
                        'Author without affiliation-id. '
                        'Problematic author: %s' % fullname
                    )
                authors.append((fullname, author_affs))
                author_affs = []
                fullname = ''
            if word:
                author_names.append(word)

    # Name words left over at the end of the text form a last author, which
    # is stored without affiliations.
    if author_names:
        fullname = ' '.join(author_names)
        if len(author_names) == 1:
            warnings.append('Author without firstname: %s' % fullname)
        author_affs = []
    if fullname:
        authors.append((fullname, author_affs))
    if unused_aff_keys:
        warnings.append('Unused affiliation-IDs: %s' % unused_aff_keys)
    return authors, warnings
def determine_aff_type_character(char_list):
    """Guess the kind of affiliation ids from a list of id strings.

    The first entry decides the kind. For letters and numbers every further
    entry has to be of the same kind (they must not be mixed); for anything
    else the kind is 'symbol' and the remaining entries are not inspected.

    Args:
        char_list: iterable of strings to classify.

    Returns:
        'alpha', 'digit' or 'symbol'; ``None`` if the input is empty or
        the entries do not agree with the kind of the first one.
    """
    remaining = iter(char_list)
    try:
        leading = next(remaining)
    except StopIteration:
        return None

    if leading.isalpha():
        aff_type = 'alpha'
        consistent = all(entry.isalpha() for entry in remaining)
    elif leading.isdigit():
        aff_type = 'digit'
        consistent = all(entry.isdigit() for entry in remaining)
    else:
        # e.g. dagger, asterisk: no consistency check for symbols
        return 'symbol'

    return aff_type if consistent else None
def determine_aff_type(text):
    """Guess how the affiliation block is formatted.

    If at least one line consists of a single character (ignoring
    surrounding spaces and dots), the ids are assumed to stand alone on
    their own line; otherwise each line is assumed to start with its id.

    Args:
        text: the block of text holding the affiliations.

    Returns:
        The compiled pattern matching an affiliation line: one group (the
        id) for ids on their own line, two groups (id, affiliation)
        otherwise.

    Raises:
        ValueError: if the kind of id (letters, numbers, symbols) cannot
            be determined.
    """
    # id alone on its line, optionally followed by dots
    id_only_patterns = {
        'alpha': re.compile(r'^([a-z]+)\.*$', re.UNICODE),
        'digit': re.compile(r'^(\d+)\.*$', re.UNICODE),
        'symbol': re.compile(r'^(.)\.*$', re.UNICODE),
    }
    # id followed by the affiliation on the same line
    id_and_text_patterns = {
        'alpha': re.compile(r'^([a-z]+)[ .]+(.*)', re.UNICODE),
        'digit': re.compile(r'^(\d+)[ .]*(.*)', re.UNICODE),
        'symbol': re.compile(r'^(.)[ .]+(.*)', re.UNICODE),
    }

    stripped_lines = [raw.strip(' .') for raw in text.split('\n')]
    lone_ids = [entry for entry in stripped_lines if len(entry) == 1]
    leading_chars = [entry[0] for entry in stripped_lines if len(entry) > 1]

    if lone_ids:
        kind = determine_aff_type_character(lone_ids)
        if not kind:
            raise ValueError('Cannot identify type of affiliation, '
                             'found IDs: %s' % lone_ids)
        return id_only_patterns[kind]

    kind = determine_aff_type_character(leading_chars)
    if not kind:
        raise ValueError('Cannot identify type of affiliations, '
                         'found IDs: %s' % leading_chars)
    return id_and_text_patterns[kind]
def parse_affiliations(text):
    """Build the mapping of affiliation id to affiliation string.

    The format is detected with ``determine_aff_type``. Letters and numbers
    must not be mixed, and letter ids are lower-case only. Examples of
    accepted layouts::

        1 CERN, Switzerland
        2 DESY, Germany

        1 CERN, Switzerland
        2DESY, Germany

        a CERN, Switzerland
        bb DESY, Germany

        * CERN, Switzerland
        # DESY, Germany

    An id may also stand alone on a line; the following lines, up to the
    next id, are then joined with spaces to form its affiliation.

    Args:
        text: the block of text holding the affiliations.

    Returns:
        dict: ``{aff_id: affiliation}``

    Raises:
        ValueError: if a non-empty line neither matches the detected
            format nor belongs to a preceding stand-alone id.
    """
    result = {}
    pattern = determine_aff_type(text)
    pending_id = None     # stand-alone id whose affiliation is being collected
    pending_lines = []    # lines gathered so far for pending_id

    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        match = pattern.search(stripped)

        if match is None:
            if pending_id:
                # continuation of the affiliation of a stand-alone id
                pending_lines.append(stripped)
            elif stripped:
                raise ValueError('Something is wrong with the affiliation list')
            continue

        if len(match.groups()) == 2:
            # id and affiliation on the same line
            result[match.group(1)] = match.group(2).strip()
            continue

        # A new stand-alone id: store what was collected for the previous one
        if pending_id and pending_lines:
            result[pending_id] = ' '.join(pending_lines).strip()
            pending_lines = []
        pending_id = match.group(1)

    # Store the affiliation of the last stand-alone id
    if pending_id and pending_lines:
        result[pending_id] = ' '.join(pending_lines).strip()
    return result
def create_authors(text):
    """Parse an author list, optionally followed by an affiliation list.

    The text is split into blocks separated by empty lines; blocks that
    contain no word character are ignored.

        1 block: authors without affiliations
        2 blocks: authors and affiliations
        more blocks: authors grouped by affiliation (not implemented yet)

    Args:
        text: the raw input; byte strings are decoded as UTF-8.

    Returns:
        dict: empty for empty input; otherwise with the key ``authors``,
        a list of ``(author_fullname, [author_affiliations])`` tuples, and,
        only if there are any, the key ``warnings``, a list of strings.

    Raises:
        ValueError: if more than two blocks are found.
    """
    if not text:
        return {}
    if not isinstance(text, six.text_type):
        text = text.decode('utf-8')

    # Input from the form contains unwanted \r's; hyphen variants are
    # normalised to a plain '-'.
    text = re_hyphens.sub('-', text.replace('\r', ''))

    blocks = [
        chunk for chunk in re_emptyline.split(text)
        if re.search(r'\w', chunk)
    ]

    block_count = len(blocks)
    if block_count > 2:
        # authors = parse_blocks(text_blocks)
        raise ValueError('Authors grouped by affiliation? - Comming soon.'
                         'Or too many empty lines.')
    if block_count == 2:
        authors, warnings = parse_authors(
            blocks[0], parse_affiliations(blocks[1]))
    elif block_count == 1:
        authors, warnings = parse_authors(blocks[0], {})
    else:
        authors, warnings = [], []

    result = {'authors': authors}
    if warnings:
        result['warnings'] = warnings
    return result