# Source code for inspirehep.modules.workflows.tasks.magpie
# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
"""Set of workflow tasks for MagPie API."""
from __future__ import absolute_import, division, print_function
import requests
from flask import current_app
from inspire_utils.record import get_value
from inspirehep.modules.workflows.utils import json_api_request
from ..utils import with_debug_logging
def get_magpie_url():
    """Build the Magpie prediction endpoint from the application config.

    Reads ``MAGPIE_API_URL`` from the current Flask application's
    configuration.

    Returns:
        The configured base URL with ``/predict`` appended, or ``None`` when
        the setting is missing or empty.
    """
    configured = current_app.config.get("MAGPIE_API_URL")
    if configured:
        return "{base_url}/predict".format(base_url=configured)
    return None
def prepare_magpie_payload(record, corpus):
    """Prepare payload to send to Magpie API.

    The text submitted for prediction is made of every non-empty title
    (``titles.title``) followed by every non-empty abstract
    (``abstracts.value``) found in the record, joined with ``". "``.

    Args:
        record: record metadata, queried with ``get_value``.
        corpus: value stored under the ``corpus`` key of the payload.

    Returns:
        dict: payload with the keys ``text`` and ``corpus``.
    """
    # Build real lists rather than relying on ``filter`` returning one: on
    # Python 3 ``filter`` yields an iterator, which cannot be concatenated
    # with ``+``.
    titles = [
        title for title in get_value(record, "titles.title", []) if title
    ]
    abstracts = [
        abstract
        for abstract in get_value(record, "abstracts.value", [])
        if abstract
    ]

    text = u". ".join(titles + abstracts)
    if str is bytes:
        # Python 2: hand over UTF-8 encoded bytes, which is what joining the
        # individually encoded parts produces. On Python 3 the text stays
        # ``str``, since bytes cannot be joined with a ``str`` separator.
        text = text.encode('utf-8')

    return dict(text=text, corpus=corpus)
def filter_magpie_response(labels, limit):
    """Filter response from Magpie API, keeping most relevant labels.

    Args:
        labels: sequence of ``(label, score)`` pairs. ``None`` or an empty
            sequence yields an empty result.
        limit: minimum score (inclusive) a label needs in order to be kept.

    Returns:
        list: ``(label, score)`` tuples whose score is at least ``limit``,
        in their original order. If no label reaches ``limit`` but
        ``labels`` is not empty, a one-element list holding the
        highest-scoring entry of ``labels`` (returned as given).
    """
    if not labels:
        return []

    kept = [(word, score) for word, score in labels if score >= limit]

    if not kept:
        # In the event that there are no labels with a high enough score,
        # we take only the top one. Selecting by score (first entry wins on
        # ties) gives the same result as taking the first element when the
        # labels arrive sorted by descending score, without depending on
        # that ordering.
        kept.append(max(labels, key=lambda pair: pair[1]))

    return kept
@with_debug_logging
def guess_keywords(obj, eng):
    """Workflow task to ask Magpie API for a keywords assessment.

    Sends the titles and abstracts of ``obj.data`` to Magpie with the
    ``keywords`` corpus and stores the first ten returned labels under
    ``obj.extra_data["keywords_prediction"]["keywords"]``. Each stored entry
    is a dict with the keys ``label``, ``score`` and ``accept``; ``accept``
    is true when the score is at least 0.09.

    The task does nothing when no Magpie URL is configured, when the request
    fails with a ``requests`` exception (the failure is logged) or when the
    response is empty.

    Args:
        obj: workflow object; ``obj.data`` is read and ``obj.extra_data`` is
            written.
        eng: workflow engine (unused).
    """
    max_keywords = 10
    accept_threshold = 0.09

    magpie_url = get_magpie_url()
    if not magpie_url:
        # Skip task if no API URL set
        return

    payload = prepare_magpie_payload(obj.data, corpus="keywords")

    try:
        results = json_api_request(magpie_url, payload)
    except requests.exceptions.RequestException:
        # The prediction is best-effort: leave a trace of the failure in the
        # logs instead of dropping it silently, then skip the task.
        current_app.logger.warning(
            "Magpie keyword prediction request failed", exc_info=True
        )
        return

    if not results:
        return

    # ``or []`` also covers a response carrying an explicit null for labels.
    labels = results.get('labels') or []
    keywords = [
        {'label': k[0], 'score': k[1], 'accept': k[1] >= accept_threshold}
        for k in labels[:max_keywords]
    ]

    obj.extra_data["keywords_prediction"] = dict(
        keywords=keywords
    )
    current_app.logger.info("Keyword prediction (top 10): {0}".format(
        obj.extra_data["keywords_prediction"]["keywords"]
    ))
@with_debug_logging
def guess_categories(obj, eng):
    """Workflow task to ask Magpie API for a subject area assessment.

    Sends the titles and abstracts of ``obj.data`` to Magpie with the
    ``categories`` corpus, keeps the labels selected by
    ``filter_magpie_response`` with a limit of 0.22 and stores them under
    ``obj.extra_data["categories_prediction"]["categories"]``. Each stored
    entry is a dict with the keys ``label``, ``score`` and ``accept``;
    ``accept`` is true when the score is at least 0.25.

    The task does nothing when no Magpie URL is configured, when the request
    fails with a ``requests`` exception (the failure is logged) or when the
    response is empty.

    Args:
        obj: workflow object; ``obj.data`` is read and ``obj.extra_data`` is
            written.
        eng: workflow engine (unused).
    """
    magpie_url = get_magpie_url()
    if not magpie_url:
        # Skip task if no API URL set
        return

    payload = prepare_magpie_payload(obj.data, corpus="categories")

    try:
        results = json_api_request(magpie_url, payload)
    except requests.exceptions.RequestException:
        # Same best-effort policy as the keywords task: a failing request
        # is logged and the prediction is skipped.
        current_app.logger.warning(
            "Magpie category prediction request failed", exc_info=True
        )
        return

    if not results:
        return

    # ``or []`` also covers a response carrying an explicit null for labels.
    labels = results.get('labels') or []
    categories = [
        {'label': c[0], 'score': c[1], 'accept': c[1] >= 0.25}
        for c in filter_magpie_response(labels, limit=0.22)
    ]

    obj.extra_data["categories_prediction"] = dict(
        categories=categories
    )
    current_app.logger.info("Category prediction: {0}".format(
        obj.extra_data["categories_prediction"]["categories"]
    ))
@with_debug_logging
def guess_experiments(obj, eng):
    """Workflow task to ask Magpie API for an experiments assessment.

    Sends the titles and abstracts of ``obj.data`` to Magpie with the
    ``experiments`` corpus, keeps the labels selected by
    ``filter_magpie_response`` with a limit of 0.5 and stores them under
    ``obj.extra_data["experiments_prediction"]["experiments"]``. Each stored
    entry is a dict with the keys ``label`` and ``score``.

    The task does nothing when no Magpie URL is configured, when the request
    fails with a ``requests`` exception (the failure is logged) or when the
    response is empty.

    Args:
        obj: workflow object; ``obj.data`` is read and ``obj.extra_data`` is
            written.
        eng: workflow engine (unused).
    """
    magpie_url = get_magpie_url()
    if not magpie_url:
        # Skip task if no API URL set
        return

    payload = prepare_magpie_payload(obj.data, corpus="experiments")

    try:
        results = json_api_request(magpie_url, payload)
    except requests.exceptions.RequestException:
        # Same best-effort policy as the keywords task: a failing request
        # is logged and the prediction is skipped.
        current_app.logger.warning(
            "Magpie experiment prediction request failed", exc_info=True
        )
        return

    if not results:
        return

    # ``or []`` also covers a response carrying an explicit null for labels.
    all_predictions = results.get('labels') or []
    selected_experiments = [
        {'label': e[0], 'score': e[1]}
        for e in filter_magpie_response(all_predictions, limit=0.5)
    ]

    obj.extra_data["experiments_prediction"] = dict(
        experiments=selected_experiments,
    )
    current_app.logger.info("Experiment prediction: {0}".format(
        obj.extra_data["experiments_prediction"]["experiments"]
    ))