Source code for inspirehep.modules.tools.authorlist

# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.


"""Functions to parse an authorlist."""

from __future__ import absolute_import, division, print_function

import re
import six

# A newline, optional whitespace, and another newline, i.e. a blank line.
# Used to split the raw input into blocks (authors / affiliations).
re_emptyline = re.compile(r'\n\s*\n', re.UNICODE)
# Hyphen look-alikes that get normalised to a plain '-'. The first
# alternative matches a literal backslash followed by "255".
# NOTE(review): the \uXXXX escapes sit inside a raw string, so they are
# interpreted by the regex engine rather than by the string literal; the
# `re` module supports them on Python 3.3+ -- confirm behaviour on Python 2.
re_hyphens = re.compile(
    r'(\\255|\u02D7|\u0335|\u0336|\u2212|\u2013|\u002D|\uFE63|\uFF0D)', re.UNICODE)
# Two or more consecutive whitespace characters.
re_multiple_space = re.compile(r'\s{2,}', re.UNICODE)
# A whole token made only of digits and/or characters that are neither word
# characters nor one of . ' -  (candidate affiliation ID or footnote mark).
re_potential_key = re.compile(r"^(?:\d|[^\w.'-])+$", re.UNICODE)
# A run of digits and/or non-word characters (excluding , . ' -) that is
# followed by a space; captured together with that trailing space.
re_trailing_nonword = re.compile(r"((?:\d|[^\w,.'-])+ )", re.UNICODE)
# Any single character that is neither a word character nor a space.
re_symbols = re.compile(r'[^\w ]', re.UNICODE)


def split_id(word):
    """
    Separate potential aff-ids that are glued together in one token.

    Every symbol (a character that is neither a word character nor a
    space) becomes an ID of its own; the non-empty chunks between the
    symbols follow after all the symbols.
    E.g.: '*12%$' -> ['*', '%', '$', '12']

    A token without any symbol is returned unchanged as a one-element list.
    """
    symbols = re_symbols.findall(word)
    if not symbols:
        return [word]

    # Splitting on the symbols leaves empty strings wherever two symbols
    # are adjacent or a symbol sits at either end; drop those.
    chunks = [chunk for chunk in re_symbols.split(word) if chunk]
    return symbols + chunks


def parse_authors(text, affiliations):
    """
    Parse author names and attach the affiliations their aff-IDs point to.

    Authors can be separated by ',', newline, ' and ' or an affiliation tag.

    Args:
        text: the author block as one string.
        affiliations: mapping of aff-ID -> affiliation string (may be empty).

    Returns:
        A 2-tuple ``(authors, warnings)``:
        authors: list of tuples ``(author_fullname, [author_affiliations])``
        warnings: list of strings
    """
    authors = []
    warnings = []

    # Turn every separator into a stand-alone ',' token.  The trailing space
    # lets re_trailing_nonword (which requires a following space) also match
    # an aff-ID at the very end of the text.
    text = text.replace(',', ' , ')
    text = text.replace('\n', ' , ')
    text = text.replace(' and ', ' , ') + ' '
    # Detach runs of digits/symbols from the word they are glued to.
    text = re_trailing_nonword.sub(r' \1', text)
    text = re_multiple_space.sub(' ', text)

    # Materialise the keys as lists: on Python 3 a dict view can neither be
    # indexed nor have items removed from it.  The keys are plain strings,
    # so a shallow copy is sufficient.
    aff_keys = list(affiliations)
    unused_aff_keys = list(aff_keys)
    key_type = ''
    if aff_keys:
        if aff_keys[0].isalpha():
            key_type = 'alpha'
        elif aff_keys[0].isdigit():
            key_type = 'digit'
        else:
            key_type = 'symbol'
            warnings.append('CAUTION! Using symbols (# and stuff) as aff-IDs.')
    else:
        warnings.append('Found no affiliations (empty line needed)')

    author_names = []   # words of the name currently being collected
    author_affs = []    # affiliations collected for the finished name
    fullname = ''       # finished name still waiting to be stored
    list_of_words = text.split(' ')
    for nw, word in enumerate(list_of_words):
        if word in affiliations or word == ',' or re_potential_key.search(word):
            # author name stops here, add affiliations
            if author_names:
                fullname = ' '.join(author_names)
                if len(author_names) == 1:
                    warnings.append('Author without firstname: %s' % fullname)
                author_names = []
            if word in affiliations:
                author_affs.append(affiliations[word])
                if word in unused_aff_keys:
                    unused_aff_keys.remove(word)
            elif not word == ',':
                # something left over or not separated
                for aff_key in split_id(word):
                    if aff_key in affiliations:
                        author_affs.append(affiliations[aff_key])
                        if aff_key in unused_aff_keys:
                            unused_aff_keys.remove(aff_key)
                    else:
                        warnings.append(
                            'Unresolved aff-ID or stray footnote symbol. '
                            'Problematic author and aff-id: %s %s' %
                            (fullname, aff_key)
                        )
        else:
            # (part of) (next) author name, process previous author
            if key_type == 'alpha' and word.islower() and word.isalpha():
                # With alphabetic aff-IDs a lower-case word is ambiguous;
                # report it together with its neighbouring words.
                three_words = \
                    list_of_words[max(nw - 2, 0):min(nw + 3, len(list_of_words)) - 1]
                warnings.append(
                    'Is this part of a name or missing aff-id? "%s" in %s' %
                    (word, ' '.join(three_words))
                )
            if fullname:
                if affiliations and not author_affs:
                    # there should be affiliations
                    warnings.append(
                        'Author without affiliation-id. '
                        'Problematic author: %s' % fullname
                    )

                authors.append((fullname, author_affs))
                author_affs = []
                fullname = ''
            if word:
                author_names.append(word)

    # A name still being collected at the end of the text was not followed
    # by any separator or aff-ID, so it is stored without affiliations.
    if author_names:
        fullname = ' '.join(author_names)
        if len(author_names) == 1:
            warnings.append('Author without firstname: %s' % fullname)
        author_affs = []
    if fullname:
        authors.append((fullname, author_affs))

    if unused_aff_keys:
        warnings.append('Unused affiliation-IDs: %s' % unused_aff_keys)

    return authors, warnings


def determine_aff_type_character(char_list):
    """
    Guess whether affiliations are keyed by number, letter or symbol
    (e.g. dagger).

    The first character decides the type. A symbol is accepted right away
    without looking at the remaining characters; for letters and digits
    every following character has to be of the same type, because numbers
    and letters should not be mixed.

    Args:
        char_list: iterable of strings (candidate IDs / first characters).

    Returns:
        'alpha', 'digit' or 'symbol'; None if the list is empty or mixes
        types.
    """
    same_kind = {
        'alpha': lambda char: char.isalpha(),
        'digit': lambda char: char.isdigit(),
    }

    aff_type = None
    for char in char_list:
        if aff_type is None:
            if char.isalpha():
                aff_type = 'alpha'
            elif char.isdigit():
                aff_type = 'digit'
            else:
                # Symbols are not checked for consistency.
                return 'symbol'
        elif not same_kind[aff_type](char):
            return None
    return aff_type


def determine_aff_type(text):
    """
    Guess format for affiliations.

    If at least one line consists of a single character (ignoring
    surrounding spaces and dots), the IDs are taken to stand alone on
    their own line; otherwise each line is expected to start with its ID.

    Args:
        text: the affiliation block as one string.

    Returns:
        Compiled search pattern. Patterns for stand-alone IDs have one
        group (the ID); patterns for "ID affiliation" lines have two
        groups (ID, affiliation).

    Raises:
        ValueError: if the type of the IDs cannot be identified.
    """
    # The ID is alone on its line, optionally followed by dots.
    line_pattern_single = {'alpha': re.compile(r'^([a-z]+)\.*$', re.UNICODE),
                           'digit': re.compile(r'^(\d+)\.*$', re.UNICODE),
                           'symbol': re.compile(r'^(.)\.*$', re.UNICODE)}

    # ID and affiliation share one line. Only for digits the separating
    # space/dot is optional ('2DESY').
    line_pattern_line = {'alpha': re.compile(r'^([a-z]+)[ .]+(.*)', re.UNICODE),
                         'digit': re.compile(r'^(\d+)[ .]*(.*)', re.UNICODE),
                         'symbol': re.compile(r'^(.)[ .]+(.*)', re.UNICODE)}

    single_char = []
    first_char = []
    for line in text.split('\n'):
        line = line.strip(' .')
        if len(line) == 1:
            single_char.append(line)
        elif line:
            first_char.append(line[0])

    if single_char:
        aff_type = determine_aff_type_character(single_char)
        if not aff_type:
            raise ValueError('Cannot identify type of affiliation, '
                             'found IDs: %s' % single_char)
        return line_pattern_single[aff_type]

    aff_type = determine_aff_type_character(first_char)
    if not aff_type:
        raise ValueError('Cannot identify type of affiliations, '
                         'found IDs: %s' % first_char)
    return line_pattern_line[aff_type]


def parse_affiliations(text):
    """
    Determine how affiliations are formatted.
    Return dict of id:affiliation

    Allowed formats:
    don't mix letters and numbers, lower-case letters only

    1
    CERN, Switzerland
    2
    DESY,
    Germany


    1 CERN, Switzerland
    2DESY, Germany

    a  CERN, Switzerland
    bb DESY, Germany

    *
    CERN, Switzerland
    #
    DESY, Germany

    Raises:
        ValueError: if the format cannot be identified or a non-empty line
        can be assigned neither to an ID nor to a preceding ID.
    """

    affiliations = {}
    aff_pattern = determine_aff_type(text)
    # Two groups: ID and affiliation are on the same line.
    # One group: the ID is on its own line, the affiliation follows.
    id_and_aff_on_one_line = aff_pattern.groups == 2

    aff_id = None
    this_aff = []
    for line in text.split('\n'):
        line = line.strip()
        get_affiliation = aff_pattern.search(line)
        if get_affiliation:
            if id_and_aff_on_one_line:
                affiliations[get_affiliation.group(1)] = \
                    get_affiliation.group(2).strip()
            else:
                # A new ID starts: store what was collected for the
                # previous one (an ID without any text is dropped).
                if aff_id and this_aff:
                    affiliations[aff_id] = ' '.join(this_aff).strip()
                    this_aff = []
                aff_id = get_affiliation.group(1)
        elif aff_id:
            # Continuation line of the affiliation of the current ID.
            this_aff.append(line)
        elif line:
            raise ValueError('Something is wrong with the affiliation list')
    # Store the affiliation of the last stand-alone ID.
    if aff_id and this_aff:
        affiliations[aff_id] = ' '.join(this_aff).strip()

    return affiliations


def create_authors(text):
    """
    Split text in (useful) blocks, separated by empty lines.
    1 block: no affiliations
    2 blocks: authors and affiliations
    more blocks: authors grouped by affiliation (not implemented yet)

    Args:
        text: the author list, either text or UTF-8 encoded bytes.

    Returns:
        dict: empty if ``text`` is empty; otherwise with the key ``authors``
        holding a list of ``(author_fullname, [author_affiliations])`` and,
        only if there are any, the key ``warnings`` holding a list of
        strings.

    Raises:
        ValueError: if there are more than two useful blocks, or if the
        affiliation block cannot be parsed.
    """

    if not text:
        return {}

    if not isinstance(text, six.text_type):
        text = text.decode('utf-8')
    text = text.replace('\r', '')  # Input from the form contains unwanted \r's
    text = re_hyphens.sub('-', text)

    # Keep only blocks with at least one word character. re.UNICODE, as for
    # the other patterns of this module, so that a block of non-ASCII names
    # is not thrown away.
    text_blocks = [
        block for block in re_emptyline.split(text)
        if re.search(r'\w', block, re.UNICODE)
    ]

    if not text_blocks:
        authors, warnings = [], []
    elif len(text_blocks) == 1:
        authors, warnings = parse_authors(text_blocks[0], {})
    elif len(text_blocks) == 2:
        affiliations = parse_affiliations(text_blocks[1])
        authors, warnings = parse_authors(text_blocks[0], affiliations)
    else:
        # authors = parse_blocks(text_blocks)
        raise ValueError('Authors grouped by affiliation? - Coming soon. '
                         'Or too many empty lines.')

    if warnings:
        return {'authors': authors, 'warnings': warnings}
    return {'authors': authors}
Source code for inspirehep.modules.tools.authorlist

INSPIRE-HEP

Navigation

Related Topics