Source code for inspirehep.modules.workflows.tasks.classifier

# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

"""Set of tasks for classification."""

from __future__ import absolute_import, division, print_function

from functools import wraps

from inspire_utils.record import get_value
from invenio_classifier import (
    get_keywords_from_local_file,
    get_keywords_from_text,
)
from invenio_classifier.errors import ClassifierException
from invenio_classifier.reader import KeywordToken

from ..proxies import antihep_keywords
from ..utils import with_debug_logging, get_document_in_workflow


@with_debug_logging
def filter_core_keywords(obj, eng):
    """Filter out core keywords that appear in the antihep blacklist."""
    try:
        result = obj.extra_data['classifier_results']["complete_output"]
    except KeyError:
        return

    filtered_core_keywords = [
        keyword for keyword in result.get('core_keywords')
        if keyword['keyword'] not in antihep_keywords
    ]

    result["filtered_core_keywords"] = filtered_core_keywords
    obj.extra_data['classifier_results']["complete_output"] = result
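
For orientation, a sketch (not part of the module) of the ``extra_data`` shape this task reads and writes. The keyword values, and the assumption that ``antihep_keywords`` resolves to a set containing ``'talk'``, are illustrative only.

# Illustrative only: obj.extra_data before and after filter_core_keywords,
# assuming the antihep_keywords proxy resolves to a set containing 'talk'.
before = {
    'classifier_results': {
        'complete_output': {
            'core_keywords': [
                {'keyword': 'black hole', 'number': 3},
                {'keyword': 'talk', 'number': 1},
            ],
        },
    },
}
after = {
    'classifier_results': {
        'complete_output': {
            'core_keywords': [
                {'keyword': 'black hole', 'number': 3},
                {'keyword': 'talk', 'number': 1},
            ],
            # 'talk' was dropped because it is blacklisted
            'filtered_core_keywords': [
                {'keyword': 'black hole', 'number': 3},
            ],
        },
    },
}
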
def classify_paper(taxonomy=None, rebuild_cache=False, no_cache=False,
                   output_limit=20, spires=False, match_mode='full',
                   with_author_keywords=False, extract_acronyms=False,
                   only_core_tags=False, fast_mode=False):
    """Extract keywords from a PDF file or from the metadata of an OAI harvest.

    Note: ``fast_mode`` is accepted for API compatibility but is not used
    by this task.
    """
    @with_debug_logging
    @wraps(classify_paper)
    def _classify_paper(obj, eng):
        from flask import current_app

        params = dict(
            taxonomy_name=taxonomy or current_app.config['HEP_ONTOLOGY_FILE'],
            output_mode='dict',
            output_limit=output_limit,
            spires=spires,
            match_mode=match_mode,
            no_cache=no_cache,
            with_author_keywords=with_author_keywords,
            rebuild_cache=rebuild_cache,
            only_core_tags=only_core_tags,
            extract_acronyms=extract_acronyms,
        )

        fulltext_used = True
        with get_document_in_workflow(obj) as tmp_document:
            try:
                if tmp_document:
                    result = get_keywords_from_local_file(tmp_document, **params)
                else:
                    # No fulltext available: classify on titles, abstracts
                    # and existing keywords instead.
                    data = get_value(obj.data, 'titles.title', [])
                    data.extend(get_value(obj.data, 'titles.subtitle', []))
                    data.extend(get_value(obj.data, 'abstracts.value', []))
                    data.extend(get_value(obj.data, 'keywords.value', []))
                    if not data:
                        obj.log.error("No classification done due to missing data.")
                        return
                    result = get_keywords_from_text(data, **params)
                    fulltext_used = False
            except ClassifierException as e:
                obj.log.exception(e)
                return

        result['complete_output'] = clean_instances_from_data(
            result.get("complete_output", {})
        )
        result["fulltext_used"] = fulltext_used

        # Only store non-empty classifier output.
        if any(result.get("complete_output", {}).values()):
            obj.extra_data['classifier_results'] = result

    return _classify_paper
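
A hedged sketch of how this factory is typically wired into a workflow definition: ``classify_paper`` is called once when the workflow is defined, and the returned ``_classify_paper`` closure runs per record. The workflow class and the ``'HEPont.rdf'`` taxonomy name below are assumptions for illustration, not taken from this module.

class ExampleWorkflow(object):
    """Hypothetical workflow definition (demo only)."""
    workflow = [
        classify_paper(
            taxonomy='HEPont.rdf',     # assumed ontology file name
            only_core_tags=False,
            spires=True,
            with_author_keywords=True,
        ),
        filter_core_keywords,
    ]
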
def clean_instances_from_data(output):
    """Replace any ``KeywordToken`` keys in the output with their ``id``."""
    new_output = {}
    for output_key in output.keys():
        keywords = output[output_key]
        # Iterate over a snapshot of the keys: the dict is mutated below,
        # and mutating a dict while iterating over it directly is unsafe.
        for key in list(keywords):
            if isinstance(key, KeywordToken):
                keywords[key.id] = keywords.pop(key)
        new_output[output_key] = keywords
    return new_output
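
A small sketch of the key normalisation this helper performs, using a stand-in class instead of a real ``KeywordToken`` (constructing one requires a loaded taxonomy). Only the ``id`` attribute, which is all the helper relies on, is mimicked.

# Illustrative only: the same normalisation pattern, with DemoToken
# standing in for KeywordToken.
class DemoToken(object):
    def __init__(self, id_):
        self.id = id_

keywords = {DemoToken('electroweak interaction'): 2, 'plain string': 1}
for key in list(keywords):
    if isinstance(key, DemoToken):  # the helper tests KeywordToken here
        keywords[key.id] = keywords.pop(key)
# keywords == {'electroweak interaction': 2, 'plain string': 1}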