Source code for inspirehep.modules.workflows.tasks.magpie

# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

"""Set of workflow tasks for MagPie API."""

from __future__ import absolute_import, division, print_function

import requests

from flask import current_app

from inspire_utils.record import get_value
from inspirehep.modules.workflows.utils import json_api_request

from ..utils import with_debug_logging


def get_magpie_url():
    """Return the Magpie URL endpoint, if any.

    Reads ``MAGPIE_API_URL`` from the current Flask application config and
    appends the ``/predict`` path to it.

    :return: the prediction endpoint URL, or ``None`` when
        ``MAGPIE_API_URL`` is unset or empty.
    """
    base_url = current_app.config.get("MAGPIE_API_URL")
    if not base_url:
        return None
    # Drop trailing slashes so that a base URL configured as "http://host/"
    # does not yield "http://host//predict".
    return "{base_url}/predict".format(base_url=base_url.rstrip("/"))


def prepare_magpie_payload(record, corpus):
    """Prepare payload to send to Magpie API.

    The payload text is made of every non-empty value found under
    ``titles.title`` followed by every non-empty value found under
    ``abstracts.value``, joined with ``". "``.

    :param record: record metadata to read titles and abstracts from.
    :param corpus: value stored under the ``corpus`` key of the payload.
    :return: dict with the ``text`` and ``corpus`` keys.
    """
    # Build real lists rather than relying on ``filter`` returning one: on
    # Python 3 ``filter`` is lazy and two filter objects cannot be added.
    titles = [
        title for title in get_value(record, "titles.title", []) if title
    ]
    abstracts = [
        abstract for abstract in get_value(record, "abstracts.value", [])
        if abstract
    ]

    text = u". ".join(titles + abstracts)
    # On Python 2 the joined text is ``unicode``: encode it so the payload
    # carries the same UTF-8 byte string as before. On Python 3 the text is
    # already a native ``str`` and is left untouched.
    if not isinstance(text, str):
        text = text.encode('utf-8')

    return dict(text=text, corpus=corpus)


def filter_magpie_response(labels, limit):
    """Filter response from Magpie API, keeping most relevant labels.

    :param labels: sequence of ``(word, score)`` pairs; ``None`` is treated
        as an empty sequence.
    :param limit: minimum score (inclusive) a label needs to be kept.
    :return: list of ``(word, score)`` tuples whose score is at least
        ``limit``. When no label qualifies but ``labels`` is not empty, the
        list holds only the first element of ``labels``, unchanged.
    """
    labels = labels or []
    filtered_labels = [
        (word, score) for word, score in labels
        if score >= limit
    ]

    # In the event that there are no labels with a high enough score,
    # we take only the top one
    if labels and not filtered_labels:
        filtered_labels.append(labels[0])
    return filtered_labels


@with_debug_logging
def guess_keywords(obj, eng):
    """Workflow task to ask Magpie API for a keywords assessment.

    Stores at most ten predictions under
    ``obj.extra_data["keywords_prediction"]["keywords"]``. Each entry is a
    dict with ``label``, ``score`` and ``accept``, the latter being ``True``
    when the score is at least 0.09.

    Nothing is stored when no Magpie URL is configured, when the request
    fails or when the response is empty.

    :param obj: workflow object; reads ``obj.data``, writes
        ``obj.extra_data``.
    :param eng: workflow engine (unused).
    """
    magpie_url = get_magpie_url()
    if not magpie_url:
        # Skip task if no API URL set
        return
    payload = prepare_magpie_payload(obj.data, corpus="keywords")
    try:
        results = json_api_request(magpie_url, payload)
    except requests.exceptions.RequestException as err:
        # A failed request is treated as "no prediction" instead of an
        # error, but a trace is left in the logs.
        current_app.logger.warning(
            "Keyword prediction request failed: %s", err
        )
        results = {}

    if not results:
        return

    # Guard against a response whose ``labels`` is missing or null.
    labels = results.get('labels') or []
    keywords = [
        {'label': k[0], 'score': k[1], 'accept': k[1] >= 0.09}
        for k in labels[:10]
    ]
    obj.extra_data["keywords_prediction"] = dict(
        keywords=keywords
    )
    current_app.logger.info("Keyword prediction (top 10): {0}".format(
        obj.extra_data["keywords_prediction"]["keywords"]
    ))


@with_debug_logging
def guess_categories(obj, eng):
    """Workflow task to ask Magpie API for a subject area assessment.

    Stores the predictions kept by ``filter_magpie_response`` (score limit
    0.22) under ``obj.extra_data["categories_prediction"]["categories"]``.
    Each entry is a dict with ``label``, ``score`` and ``accept``, the
    latter being ``True`` when the score is at least 0.25.

    Nothing is stored when no Magpie URL is configured or when the response
    is empty. Errors raised by ``json_api_request`` are not caught here.

    :param obj: workflow object; reads ``obj.data``, writes
        ``obj.extra_data``.
    :param eng: workflow engine (unused).
    """
    magpie_url = get_magpie_url()
    if not magpie_url:
        # Skip task if no API URL set
        return
    payload = prepare_magpie_payload(obj.data, corpus="categories")
    results = json_api_request(magpie_url, payload)
    if not results:
        return

    # Guard against a response whose ``labels`` is missing or null.
    labels = results.get('labels') or []
    categories = [
        {'label': c[0], 'score': c[1], 'accept': c[1] >= 0.25}
        for c in filter_magpie_response(labels, limit=0.22)
    ]

    obj.extra_data["categories_prediction"] = dict(
        categories=categories
    )
    current_app.logger.info("Category prediction: {0}".format(
        obj.extra_data["categories_prediction"]["categories"]
    ))


@with_debug_logging
def guess_experiments(obj, eng):
    """Workflow task to ask Magpie API for an experiments assessment.

    Stores the predictions kept by ``filter_magpie_response`` (score limit
    0.5) under ``obj.extra_data["experiments_prediction"]["experiments"]``.
    Each entry is a dict with ``label`` and ``score``.

    Nothing is stored when no Magpie URL is configured or when the response
    is empty. Errors raised by ``json_api_request`` are not caught here.

    :param obj: workflow object; reads ``obj.data``, writes
        ``obj.extra_data``.
    :param eng: workflow engine (unused).
    """
    magpie_url = get_magpie_url()
    if not magpie_url:
        # Skip task if no API URL set
        return

    payload = prepare_magpie_payload(obj.data, corpus="experiments")
    results = json_api_request(magpie_url, payload)
    if not results:
        return

    # Guard against a response whose ``labels`` is missing or null.
    all_predictions = results.get('labels') or []
    selected_experiments = [
        {'label': e[0], 'score': e[1]}
        for e in filter_magpie_response(all_predictions, limit=0.5)
    ]
    obj.extra_data["experiments_prediction"] = dict(
        experiments=selected_experiments,
    )
    current_app.logger.info("Experiment prediction: {0}".format(
        obj.extra_data["experiments_prediction"]["experiments"]
    ))
Source code for inspirehep.modules.workflows.tasks.magpie

INSPIRE-HEP

Navigation

Related Topics