Source code for inspirehep.modules.disambiguation.api
# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction
"""Disambiguation API."""
from __future__ import absolute_import, division, print_function
import json
from collections import defaultdict
import six
from flask import current_app
from inspirehep.modules.disambiguation.core.db.readers import (
get_all_curated_signatures,
get_all_publications,
)
from inspirehep.modules.disambiguation.core.ml.models import (
DistanceEstimator,
EthnicityEstimator,
)
from inspirehep.modules.disambiguation.core.ml.sampling import sample_signature_pairs
from inspirehep.modules.disambiguation.utils import open_file_in_folder
def save_sampled_pairs():
    """Write the sampled signature pairs to disk as JSON lines.

    The output file (``sampled_pairs.jsonl`` by default) holds one JSON
    document per line, each being a pair of signatures sampled from INSPIRE.
    ``BEARD`` consumes this file during training.

    All paths and the number of pairs to sample are read from the Flask
    application configuration (``DISAMBIGUATION_*`` keys).
    """
    config = current_app.config

    with open_file_in_folder(config['DISAMBIGUATION_SAMPLED_PAIRS_PATH'], 'w') as output:
        # The remaining settings are looked up only after the output file
        # has been opened, in the same order they are passed to the sampler.
        sampled_pairs = sample_signature_pairs(
            config['DISAMBIGUATION_CURATED_SIGNATURES_PATH'],
            config['DISAMBIGUATION_INPUT_CLUSTERS_PATH'],
            config['DISAMBIGUATION_SAMPLED_PAIRS_SIZE'],
        )
        for signature_pair in sampled_pairs:
            serialized = json.dumps(signature_pair)
            output.write(serialized + '\n')
def save_publications():
    """Write all publications to disk as JSON lines.

    The output file (``publications.jsonl`` by default) holds one JSON
    document per line, one for each record in INSPIRE, carrying the
    information ``BEARD`` needs during training and prediction.

    The destination is taken from the ``DISAMBIGUATION_PUBLICATIONS_PATH``
    key of the Flask application configuration.
    """
    destination = current_app.config['DISAMBIGUATION_PUBLICATIONS_PATH']

    with open_file_in_folder(destination, 'w') as output:
        for record in get_all_publications():
            serialized = json.dumps(record)
            output.write(serialized + '\n')
def train_and_save_ethnicity_model():
    """Fit the ethnicity estimator and persist the resulting model.

    Training data is loaded from ``DISAMBIGUATION_ETHNICITY_DATA_PATH`` and
    the fitted model is saved to ``DISAMBIGUATION_ETHNICITY_MODEL_PATH``;
    both keys are read from the Flask application configuration.
    """
    config = current_app.config

    ethnicity_model = EthnicityEstimator()
    ethnicity_model.load_data(config['DISAMBIGUATION_ETHNICITY_DATA_PATH'])
    ethnicity_model.fit()
    ethnicity_model.save_model(config['DISAMBIGUATION_ETHNICITY_MODEL_PATH'])
def train_and_save_distance_model():
    """Fit the distance estimator and persist the resulting model.

    An ethnicity model is first loaded from
    ``DISAMBIGUATION_ETHNICITY_MODEL_PATH`` and handed to the distance
    estimator. The distance estimator then loads the curated signatures,
    the sampled pairs (together with the configured pairs size) and the
    publications, is fitted, and is saved to
    ``DISAMBIGUATION_DISTANCE_MODEL_PATH``.

    All paths and sizes come from the Flask application configuration.
    """
    config = current_app.config

    # The distance estimator is constructed around an already trained
    # ethnicity model that is read back from disk.
    ethnicity_model = EthnicityEstimator()
    ethnicity_model.load_model(config['DISAMBIGUATION_ETHNICITY_MODEL_PATH'])

    distance_model = DistanceEstimator(ethnicity_model)

    curated_signatures_path = config['DISAMBIGUATION_CURATED_SIGNATURES_PATH']
    sampled_pairs_path = config['DISAMBIGUATION_SAMPLED_PAIRS_PATH']
    sampled_pairs_size = config['DISAMBIGUATION_SAMPLED_PAIRS_SIZE']
    publications_path = config['DISAMBIGUATION_PUBLICATIONS_PATH']
    distance_model.load_data(
        curated_signatures_path,
        sampled_pairs_path,
        sampled_pairs_size,
        publications_path,
    )

    distance_model.fit()
    distance_model.save_model(config['DISAMBIGUATION_DISTANCE_MODEL_PATH'])