# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
"""Set of tasks for classification."""
from __future__ import absolute_import, division, print_function
from functools import wraps
from inspire_utils.record import get_value
from invenio_classifier import (
get_keywords_from_local_file,
get_keywords_from_text,
)
from invenio_classifier.errors import ClassifierException
from invenio_classifier.reader import KeywordToken
from ..proxies import antihep_keywords
from ..utils import with_debug_logging, get_document_in_workflow


@with_debug_logging
def filter_core_keywords(obj, eng):
    """Filter core keywords."""
    try:
        result = obj.extra_data['classifier_results']["complete_output"]
    except KeyError:
        return

    filtered_core_keywords = [
        keyword for keyword in result.get('core_keywords')
        if keyword['keyword'] not in antihep_keywords
    ]
    result["filtered_core_keywords"] = filtered_core_keywords

    obj.extra_data['classifier_results']["complete_output"] = result


def classify_paper(taxonomy=None, rebuild_cache=False, no_cache=False,
                   output_limit=20, spires=False,
                   match_mode='full', with_author_keywords=False,
                   extract_acronyms=False, only_core_tags=False,
                   fast_mode=False):
    """Extract keywords from a PDF file or from the metadata of an OAI harvest."""
    @with_debug_logging
    @wraps(classify_paper)
    def _classify_paper(obj, eng):
        from flask import current_app
        params = dict(
            taxonomy_name=taxonomy or current_app.config['HEP_ONTOLOGY_FILE'],
            output_mode='dict',
            output_limit=output_limit,
            spires=spires,
            match_mode=match_mode,
            no_cache=no_cache,
            with_author_keywords=with_author_keywords,
            rebuild_cache=rebuild_cache,
            only_core_tags=only_core_tags,
            extract_acronyms=extract_acronyms,
        )

        fulltext_used = True
        with get_document_in_workflow(obj) as tmp_document:
            try:
                if tmp_document:
                    result = get_keywords_from_local_file(tmp_document, **params)
                else:
                    data = get_value(obj.data, 'titles.title', [])
                    data.extend(get_value(obj.data, 'titles.subtitle', []))
                    data.extend(get_value(obj.data, 'abstracts.value', []))
                    data.extend(get_value(obj.data, 'keywords.value', []))
                    if not data:
                        obj.log.error("No classification done due to missing data.")
                        return
                    result = get_keywords_from_text(data, **params)
                    fulltext_used = False
            except ClassifierException as e:
                obj.log.exception(e)
                return

        result['complete_output'] = clean_instances_from_data(
            result.get("complete_output", {})
        )
        result["fulltext_used"] = fulltext_used

        # Only store the results if the output is not empty.
        if any(result.get("complete_output", {}).values()):
            obj.extra_data['classifier_results'] = result

    return _classify_paper
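

# Usage sketch (an assumption, not taken from this module): ``classify_paper``
# is a task factory, so a workflow definition calls it to build the task that
# the engine will run. The taxonomy file name below is hypothetical.
#
#     classify_paper(
#         taxonomy="HEPont.rdf",
#         only_core_tags=False,
#         spires=True,
#         with_author_keywords=True,
#     )
#
# The returned ``_classify_paper(obj, eng)`` closure is what the workflow
# engine executes; its results end up in ``obj.extra_data['classifier_results']``.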


def clean_instances_from_data(output):
    """Replace ``KeywordToken`` keys in the classifier output with their ``id``."""
    new_output = {}
    for output_key in output.keys():
        keywords = output[output_key]
        # Iterate over a copy of the keys so the dict can be mutated safely.
        for key in list(keywords):
            if isinstance(key, KeywordToken):
                keywords[key.id] = keywords.pop(key)
        new_output[output_key] = keywords
    return new_output
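

# Illustrative sketch with hypothetical values: given a keyword map keyed by
# ``KeywordToken`` instances, ``clean_instances_from_data`` rewrites the keys
# to the tokens' ``id`` so the structure can be serialised, e.g.
#
#     before = {"core_keywords": {<KeywordToken "black holes">: 3}}
#     after  = {"core_keywords": {"black holes": 3}}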