# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
"""Disambiguation core ML models."""
from __future__ import absolute_import, division, print_function
import csv
import json
import pickle
import numpy as np
from scipy.special import expit
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.svm import LinearSVC
from beard.similarity import (
CosineSimilarity,
ElementMultiplication,
EstimatorTransformer,
PairTransformer,
StringDistance,
)
from beard.utils import (
FuncTransformer,
Shaper,
given_name,
given_name_initial,
normalize_name,
)
from inspire_utils.record import get_value
from inspirehep.modules.disambiguation.utils import open_file_in_folder
class EthnicityEstimator(object):
    """Predict the ethnicity of an author from their name.

    Internally a character n-gram TF-IDF vectorizer feeding a linear SVM.
    Training data is read from a CSV with ``RACE``, ``NAMELAST`` and
    ``NAMEFRST`` columns.
    """

    def __init__(self, C=4.0):
        """Initialize the estimator.

        Args:
            C (float): inverse regularization strength for the ``LinearSVC``.
        """
        self.C = C

    def load_data(self, input_filename):
        """Load training data from a CSV file.

        Each row must provide an integer ``RACE`` label and ``NAMELAST`` /
        ``NAMEFRST`` name parts. Populates ``self.X`` with normalized
        "last, first" names and ``self.y`` with the labels.
        """
        ethnicities, lasts, firsts = [], [], []
        with open(input_filename, 'r') as fd:
            reader = csv.DictReader(fd)
            for row in reader:
                ethnicities.append(int(row['RACE']))
                lasts.append(row['NAMELAST'])
                firsts.append(row['NAMEFRST'])

        names = ['%s, %s' % (last, first) for last, first in zip(lasts, firsts)]
        self.X = [normalize_name(name) for name in names]
        self.y = ethnicities

    def load_model(self, input_filename):
        """Load a previously pickled estimator from ``input_filename``.

        Pickle data is binary, so the file is opened in ``'rb'`` mode;
        the previous ``'r'`` text mode fails on Python 3.
        """
        with open(input_filename, 'rb') as fd:
            self.estimator = pickle.load(fd)

    def save_model(self, output_filename):
        """Pickle the estimator to ``output_filename``, creating folders as needed.

        Opened in ``'wb'`` mode: ``HIGHEST_PROTOCOL`` pickles are binary and
        cannot be written through a text-mode file on Python 3.
        """
        with open_file_in_folder(output_filename, 'wb') as fd:
            pickle.dump(self.estimator, fd, protocol=pickle.HIGHEST_PROTOCOL)

    def fit(self):
        """Fit the TF-IDF + LinearSVC pipeline on the data from :meth:`load_data`."""
        self.estimator = Pipeline([
            ('transformer', TfidfVectorizer(
                analyzer='char_wb',
                ngram_range=(1, 5),
                min_df=0.00005,
                dtype=np.float32,
                decode_error='replace',
            )),
            ('classifier', LinearSVC(C=self.C)),
        ])
        self.estimator.fit(self.X, self.y)

    def predict(self, X):
        """Return the predicted ethnicity labels for the given names."""
        return self.estimator.predict(X)
class DistanceEstimator(object):
    """Estimate the distance between pairs of signatures.

    Combines several pairwise similarity features (full name, initials,
    given names, affiliation, coauthors, abstract, keywords, collaborations,
    topics, title and predicted ethnicity) and feeds them to a random
    forest. The target is a *distance*: 0 for same cluster, 1 otherwise.
    """

    def __init__(self, ethnicity_estimator):
        """Initialize with a fitted :class:`EthnicityEstimator`.

        Its inner ``estimator`` attribute is used as a feature transformer
        during :meth:`fit`.
        """
        self.ethnicity_estimator = ethnicity_estimator

    def load_data(self, signatures_path, pairs_path, pairs_size, publications_path):
        """Load training data from JSON-lines files.

        Args:
            signatures_path: one JSON signature per line, each with
                ``publication_id`` and ``signature_uuid`` keys.
            pairs_path: one JSON pair per line, each with a 2-element
                ``signature_uuids`` list and a ``same_cluster`` flag.
            pairs_size (int): number of pairs in ``pairs_path``, used to
                preallocate the arrays.
            publications_path: one JSON publication per line, each with a
                ``publication_id`` key.

        Populates ``self.X`` (pairs of signature dicts, publications
        attached) and ``self.y`` (0 if same cluster, 1 otherwise).
        """
        publications_by_id = {}
        with open(publications_path, 'r') as fd:
            for line in fd:
                publication = json.loads(line)
                publications_by_id[publication['publication_id']] = publication

        signatures_by_uuid = {}
        with open(signatures_path, 'r') as fd:
            for line in fd:
                signature = json.loads(line)
                signature['publication'] = publications_by_id[signature['publication_id']]
                signatures_by_uuid[signature['signature_uuid']] = signature

        # `np.object` / `np.int` were deprecated aliases of the builtins and
        # were removed in NumPy 1.24; use the builtins directly.
        self.X = np.empty((pairs_size, 2), dtype=object)
        self.y = np.empty(pairs_size, dtype=int)
        with open(pairs_path, 'r') as fd:
            for i, line in enumerate(fd):
                pair = json.loads(line)
                self.X[i, 0] = signatures_by_uuid[pair['signature_uuids'][0]]
                self.X[i, 1] = signatures_by_uuid[pair['signature_uuids'][1]]
                # The label is a distance, not a similarity: 0 == same author.
                self.y[i] = 0 if pair['same_cluster'] else 1

    def load_model(self, input_filename):
        """Load a previously pickled distance estimator.

        Pickle data is binary, so the file is opened in ``'rb'`` mode;
        the previous ``'r'`` text mode fails on Python 3.
        """
        with open(input_filename, 'rb') as fd:
            self.distance_estimator = pickle.load(fd)

    def save_model(self, output_filename):
        """Pickle the distance estimator, creating intermediate folders.

        Opened in ``'wb'`` mode: ``HIGHEST_PROTOCOL`` pickles are binary and
        cannot be written through a text-mode file on Python 3.
        """
        with open_file_in_folder(output_filename, 'wb') as fd:
            pickle.dump(self.distance_estimator, fd, protocol=pickle.HIGHEST_PROTOCOL)

    def fit(self):
        """Fit the distance model on the data loaded by :meth:`load_data`.

        Builds a ``FeatureUnion`` of pairwise similarity features, then
        trains a 500-tree random forest on top of it.
        """
        transformer = FeatureUnion([
            ('author_full_name_similarity', Pipeline([
                ('pairs', PairTransformer(
                    element_transformer=Pipeline([
                        ('full_name', FuncTransformer(func=get_author_full_name)),
                        ('shaper', Shaper(newshape=(-1,))),
                        ('tf-idf', TfidfVectorizer(
                            analyzer='char_wb',
                            ngram_range=(2, 4),
                            dtype=np.float32,
                            decode_error='replace',
                        )),
                    ]),
                    groupby=group_by_signature,
                )),
                ('combiner', CosineSimilarity()),
            ])),
            ('author_second_initial_similarity', Pipeline([
                ('pairs', PairTransformer(
                    element_transformer=FuncTransformer(func=get_second_initial),
                    groupby=group_by_signature,
                )),
                ('combiner', StringDistance(similarity_function='character_equality')),
            ])),
            ('author_first_given_name_similarity', Pipeline([
                ('pairs', PairTransformer(
                    element_transformer=FuncTransformer(func=get_first_given_name),
                    groupby=group_by_signature,
                )),
                ('combiner', StringDistance()),
            ])),
            ('author_second_given_name_similarity', Pipeline([
                ('pairs', PairTransformer(
                    element_transformer=FuncTransformer(func=get_second_given_name),
                    groupby=group_by_signature,
                )),
                ('combiner', StringDistance()),
            ])),
            ('author_other_names_similarity', Pipeline([
                ('pairs', PairTransformer(
                    element_transformer=Pipeline([
                        ('other_names', FuncTransformer(func=get_author_other_names)),
                        ('shaper', Shaper(newshape=(-1,))),
                        ('tf-idf', TfidfVectorizer(
                            analyzer='char_wb',
                            ngram_range=(2, 4),
                            dtype=np.float32,
                            decode_error='replace',
                        )),
                    ]),
                    groupby=group_by_signature,
                )),
                ('combiner', CosineSimilarity()),
            ])),
            ('affiliation_similarity', Pipeline([
                ('pairs', PairTransformer(
                    element_transformer=Pipeline([
                        ('affiliation', FuncTransformer(func=get_author_affiliation)),
                        ('shaper', Shaper(newshape=(-1,))),
                        ('tf-idf', TfidfVectorizer(
                            analyzer='char_wb',
                            ngram_range=(2, 4),
                            dtype=np.float32,
                            decode_error='replace',
                        )),
                    ]),
                    groupby=group_by_signature,
                )),
                ('combiner', CosineSimilarity()),
            ])),
            ('coauthors_similarity', Pipeline([
                ('pairs', PairTransformer(
                    element_transformer=Pipeline([
                        ('coauthors', FuncTransformer(func=get_coauthors_neighborhood)),
                        ('shaper', Shaper(newshape=(-1,))),
                        ('tf-idf', TfidfVectorizer(
                            dtype=np.float32,
                            decode_error='replace',
                        )),
                    ]),
                    groupby=group_by_signature,
                )),
                ('combiner', CosineSimilarity()),
            ])),
            ('abstract_similarity', Pipeline([
                ('pairs', PairTransformer(
                    element_transformer=Pipeline([
                        ('abstract', FuncTransformer(func=get_abstract)),
                        ('shaper', Shaper(newshape=(-1,))),
                        ('tf-idf', TfidfVectorizer(
                            dtype=np.float32,
                            decode_error='replace',
                        )),
                    ]),
                    groupby=group_by_signature,
                )),
                ('combiner', CosineSimilarity()),
            ])),
            ('keywords_similarity', Pipeline([
                ('pairs', PairTransformer(
                    element_transformer=Pipeline([
                        ('keywords', FuncTransformer(func=get_keywords)),
                        ('shaper', Shaper(newshape=(-1,))),
                        ('tf-idf', TfidfVectorizer(
                            dtype=np.float32,
                            decode_error='replace',
                        )),
                    ]),
                    groupby=group_by_signature,
                )),
                ('combiner', CosineSimilarity()),
            ])),
            ('collaborations_similarity', Pipeline([
                ('pairs', PairTransformer(
                    element_transformer=Pipeline([
                        ('collaborations', FuncTransformer(func=get_collaborations)),
                        ('shaper', Shaper(newshape=(-1,))),
                        ('tf-idf', TfidfVectorizer(
                            dtype=np.float32,
                            decode_error='replace',
                        )),
                    ]),
                    groupby=group_by_signature,
                )),
                ('combiner', CosineSimilarity()),
            ])),
            # NOTE: fixed a typo in the feature name ('subject_similairty'),
            # and the Shaper newshape was a bare int (-1) instead of the
            # (-1,) tuple used by every sibling feature — both equivalent in
            # effect, but now consistent.
            ('subject_similarity', Pipeline([
                ('pairs', PairTransformer(
                    element_transformer=Pipeline([
                        ('keywords', FuncTransformer(func=get_topics)),
                        ('shaper', Shaper(newshape=(-1,))),
                        ('tf-idf', TfidfVectorizer(
                            dtype=np.float32,
                            decode_error='replace',
                        )),
                    ]),
                    groupby=group_by_signature,
                )),
                ('combiner', CosineSimilarity()),
            ])),
            ('title_similarity', Pipeline([
                ('pairs', PairTransformer(
                    element_transformer=Pipeline([
                        ('title', FuncTransformer(func=get_title)),
                        ('shaper', Shaper(newshape=(-1,))),
                        ('tf-idf', TfidfVectorizer(
                            analyzer='char_wb',
                            ngram_range=(2, 4),
                            dtype=np.float32,
                            decode_error='replace',
                        )),
                    ]),
                    groupby=group_by_signature,
                )),
                ('combiner', CosineSimilarity()),
            ])),
            ('author_ethnicity', Pipeline([
                ('pairs', PairTransformer(
                    element_transformer=Pipeline([
                        ('name', FuncTransformer(func=get_author_full_name)),
                        ('shaper', Shaper(newshape=(-1,))),
                        ('classifier', EstimatorTransformer(self.ethnicity_estimator.estimator)),
                    ]),
                    groupby=group_by_signature,
                )),
                # Map SVM decision values to (0, 1) before multiplying.
                ('sigmoid', FuncTransformer(func=expit)),
                ('combiner', ElementMultiplication()),
            ])),
        ])
        classifier = RandomForestClassifier(n_estimators=500, n_jobs=8)
        self.distance_estimator = Pipeline([('transformer', transformer), ('classifier', classifier)])
        self.distance_estimator.fit(self.X, self.y)
def get_author_full_name(signature):
    """Return the normalized full name of the signature's author."""
    full_name = signature['author_name']
    return normalize_name(full_name)
def get_first_initial(signature):
    """Return the initial of the author's first given name, or '' if absent."""
    name = signature['author_name']
    try:
        return given_name_initial(name, 0)
    except IndexError:
        # The author has no given names at all.
        return ''
def get_second_initial(signature):
    """Return the initial of the author's second given name, or '' if absent."""
    name = signature['author_name']
    try:
        return given_name_initial(name, 1)
    except IndexError:
        # The author has fewer than two given names.
        return ''
def get_first_given_name(signature):
    """Return the author's first given name."""
    name = signature['author_name']
    return given_name(name, 0)
def get_second_given_name(signature):
    """Return the author's second given name."""
    name = signature['author_name']
    return given_name(name, 1)
def get_author_other_names(signature):
    """Return the normalized part of the name after the family name.

    Names are expected in "last, other names" form; returns '' when there
    is no comma-separated remainder.
    """
    _, separator, other_names = signature['author_name'].partition(',')
    if not separator:
        return ''
    return normalize_name(other_names)
def get_author_affiliation(signature):
    """Return the normalized affiliation of the author, or '' when missing."""
    affiliation = signature['author_affiliation']
    if not affiliation:
        return ''
    return normalize_name(affiliation)
def get_coauthors_neighborhood(signature, radius=10):
    """Return a space-joined window of author names around this author.

    The window spans up to ``radius`` positions on each side of the
    author's position in the publication's author list. When the author
    is not found in the list, all authors are returned instead.
    """
    authors = get_value(signature, 'publication.authors', default=[])
    author_name = signature['author_name']
    if author_name not in authors:
        return ' '.join(authors)
    center = authors.index(author_name)
    start = max(0, center - radius)
    stop = min(len(authors), center + radius)
    return ' '.join(authors[start:stop])
def get_abstract(signature):
    """Return the abstract of the signature's publication, or '' if absent."""
    abstract = get_value(signature, 'publication.abstract', default='')
    return abstract
def get_keywords(signature):
    """Return the publication's keywords joined into a single string."""
    keywords = get_value(signature, 'publication.keywords', default=[])
    return ' '.join(keywords)
def get_collaborations(signature):
    """Return the publication's collaborations joined into a single string."""
    collaborations = get_value(signature, 'publication.collaborations', default=[])
    return ' '.join(collaborations)
def get_topics(signature):
    """Return the publication's topics joined into a single string."""
    topics = get_value(signature, 'publication.topics', default=[])
    return ' '.join(topics)
def get_title(signature):
    """Return the title of the signature's publication, or '' if absent."""
    title = get_value(signature, 'publication.title', default='')
    return title
def group_by_signature(signatures):
    """Return the group key for a list of signatures: the first one's UUID."""
    first_signature = signatures[0]
    return first_signature['signature_uuid']