# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
"""Workflow for processing single arXiv records harvested."""
from __future__ import absolute_import, division, print_function
from workflow.patterns.controlflow import (
IF,
IF_NOT,
IF_ELSE,
)
from inspirehep.modules.workflows.tasks.refextract import extract_journal_info
from inspirehep.modules.workflows.tasks.arxiv import (
arxiv_author_list,
arxiv_package_download,
arxiv_plot_extract,
arxiv_derive_inspire_categories,
populate_arxiv_document,
)
from inspirehep.modules.workflows.tasks.actions import (
add_core,
count_reference_coreness,
download_documents,
error_workflow,
fix_submission_number,
halt_record,
is_arxiv_paper,
is_experimental_paper,
is_marked,
is_record_accepted,
is_record_relevant,
is_submission,
load_from_source_data,
mark,
normalize_journal_titles,
populate_journal_coverage,
populate_submission_document,
preserve_root,
refextract,
reject_record,
save_workflow,
set_refereed_and_fix_document_type,
validate_record,
jlab_ticket_needed,
)
from inspirehep.modules.workflows.tasks.classifier import (
classify_paper,
filter_core_keywords,
)
from inspirehep.modules.workflows.tasks.beard import guess_coreness
from inspirehep.modules.workflows.tasks.magpie import (
guess_keywords,
guess_categories,
guess_experiments,
)
from inspirehep.modules.workflows.tasks.matching import (
stop_processing,
raise_if_match_wf_in_error_or_initial,
match_non_completed_wf_in_holdingpen,
match_previously_rejected_wf_in_holdingpen,
exact_match,
fuzzy_match,
is_fuzzy_match_approved,
set_exact_match_as_approved_in_extradata,
set_fuzzy_match_approved_in_extradata,
has_same_source,
stop_matched_holdingpen_wfs,
auto_approve,
set_core_in_extra_data,
has_more_than_one_exact_match,
)
from inspirehep.modules.workflows.tasks.merging import (
has_conflicts,
merge_articles,
)
from inspirehep.modules.workflows.tasks.upload import (
set_schema,
store_record,
store_root,
)
from inspirehep.modules.workflows.tasks.submission import (
close_ticket,
create_ticket,
filter_keywords,
prepare_keywords,
reply_ticket,
send_to_legacy,
wait_webcoll,
)
from inspirehep.modules.workflows.utils import do_not_repeat
from inspirehep.modules.literaturesuggest.tasks import (
curation_ticket_needed,
reply_ticket_context,
new_ticket_context,
curation_ticket_context,
)
# Ticket tasks for a newly received submission: create a ticket in the
# ``HEP_add_user`` queue (its id is stored under ``ticket_id``), then reply
# on the ticket using the ``user_submitted`` template.
# Each task is wrapped in ``do_not_repeat`` under its own key (presumably so
# it is not executed again if the workflow is restarted -- confirm).
NOTIFY_SUBMISSION = [
    do_not_repeat('create_ticket_curator_new_submission')(
        create_ticket(
            template="literaturesuggest/tickets/curator_submitted.html",
            queue="HEP_add_user",
            context_factory=new_ticket_context,
            ticket_id_key="ticket_id",
        ),
    ),
    do_not_repeat('reply_ticket_user_new_submission')(
        reply_ticket(
            template="literaturesuggest/tickets/user_submitted.html",
            context_factory=reply_ticket_context,
            keep_new=True,
        ),
    ),
]
# Decide the value of the 'auto-approved' mark.
# Submissions are always marked False.  Any other record is marked True only
# when ``auto_approve`` holds, in which case ``set_core_in_extra_data`` runs
# as well; otherwise it is marked False.
CHECK_AUTO_APPROVE = [
    IF_ELSE(
        is_submission,
        mark('auto-approved', False),
        IF_ELSE(
            auto_approve,
            [
                mark('auto-approved', True),
                set_core_in_extra_data,
            ],
            mark('auto-approved', False),
        ),
    ),
]
# Enrichment steps, executed in the order listed.
ENHANCE_RECORD = [
    # arXiv-only steps.
    IF(
        is_arxiv_paper,
        [
            populate_arxiv_document,
            arxiv_package_download,
            arxiv_plot_extract,
            arxiv_derive_inspire_categories,
            arxiv_author_list("authorlist2marcxml.xsl"),
        ],
    ),
    # Submission-only step.
    IF(
        is_submission,
        populate_submission_document,
    ),
    # Steps applied to every record.
    download_documents,
    normalize_journal_titles,
    refextract,
    count_reference_coreness,
    extract_journal_info,
    populate_journal_coverage,
    classify_paper(
        only_core_tags=False,
        spires=True,
        with_author_keywords=True,
    ),
    filter_core_keywords,
    guess_categories,
    # Experiments are only guessed for experimental papers.
    IF(
        is_experimental_paper,
        guess_experiments,
    ),
    guess_keywords,
    guess_coreness,
]
# For submissions only: reply on the ticket when the record was not accepted.
# No ``template`` is passed here, so ``reply_ticket`` falls back to whatever
# default it defines.
NOTIFY_NOT_ACCEPTED = [
    IF(
        is_submission,
        do_not_repeat('reply_ticket_submission_not_accepted')(
            reply_ticket(context_factory=reply_ticket_context),
        ),
    ),
]
# Reject a record that already exists on INSPIRE: reject it, mark it as not
# approved, reply on and close the ticket stored under ``ticket_id``, then
# save the workflow and stop processing.
NOTIFY_ALREADY_EXISTING = [
    reject_record('Article was already found on INSPIRE'),
    mark('approved', False),
    do_not_repeat('reply_ticket_user_submission_already_in_inspire')(
        reply_ticket(
            template="literaturesuggest/tickets/user_rejected_exists.html",
            context_factory=reply_ticket_context,
        ),
    ),
    do_not_repeat('close_ticket_user_submission_already_in_inspire')(
        close_ticket(ticket_id_key="ticket_id"),
    ),
    save_workflow,
    stop_processing,
]
# For submissions only: reply on the ticket using the ``user_accepted``
# template.
NOTIFY_ACCEPTED = [
    IF(
        is_submission,
        do_not_repeat('reply_ticket_user_submission_accepted')(
            reply_ticket(
                template='literaturesuggest/tickets/user_accepted.html',
                context_factory=reply_ticket_context,
            ),
        ),
    ),
]
# Open a curation ticket for records that are not updates.
# When ``jlab_ticket_needed`` holds, the ticket goes to ``HEP_curation_jlab``;
# otherwise one is created in ``HEP_curation`` only if
# ``curation_ticket_needed`` holds.  Both variants store the ticket id under
# ``curation_ticket_id``.
NOTIFY_CURATOR_IF_NEEDED = [
    IF_NOT(
        is_marked('is-update'),
        [
            IF_ELSE(
                jlab_ticket_needed,
                do_not_repeat('create_ticket_jlab_curation')(
                    create_ticket(
                        template=(
                            'literaturesuggest/tickets/'
                            'curation_jlab.html'
                        ),
                        queue='HEP_curation_jlab',
                        context_factory=curation_ticket_context,
                        ticket_id_key='curation_ticket_id',
                    ),
                ),
                IF(
                    curation_ticket_needed,
                    do_not_repeat('create_ticket_curator_core_curation')(
                        create_ticket(
                            template=(
                                'literaturesuggest/tickets/'
                                'curation_core.html'
                            ),
                            queue='HEP_curation',
                            context_factory=curation_ticket_context,
                            ticket_id_key='curation_ticket_id',
                        ),
                    ),
                ),
            ),
        ],
    ),
]
# Final clean-up steps applied to an accepted record, ending with validation
# against the 'hep' schema.
POSTENHANCE_RECORD = [
    add_core,
    filter_keywords,
    prepare_keywords,
    set_refereed_and_fix_document_type,
    fix_submission_number,
    validate_record('hep'),
]
# Single-step stage that runs the ``send_to_legacy`` task.
SEND_TO_LEGACY = [send_to_legacy]
# Run ``wait_webcoll`` unless the record is marked as an update.
WAIT_FOR_LEGACY_WEBCOLL = [
    IF_NOT(
        is_marked('is-update'),
        wait_webcoll,
    ),
]
# A submission that is marked as an update is treated as already existing:
# run the NOTIFY_ALREADY_EXISTING steps, which end by stopping the workflow.
STOP_IF_EXISTING_SUBMISSION = [
    IF(
        is_submission,
        IF(
            is_marked('is-update'),
            NOTIFY_ALREADY_EXISTING,
        ),
    ),
]
# Approval stage; ``preserve_root`` always runs first.
#
# * Update: merge the articles, halt with ``merge_approval`` if there are
#   conflicts, then mark the record both 'approved' and 'merged'.
# * New and auto-approved: mark 'approved'.
# * New otherwise: reject and stop if the record is not relevant, else halt
#   with ``hep_approval``.
HALT_FOR_APPROVAL_IF_NEW_OR_STOP_IF_NOT_RELEVANT = [
    preserve_root,
    IF_ELSE(
        is_marked('is-update'),
        [
            merge_articles,
            IF(
                has_conflicts,
                halt_record(
                    action='merge_approval',
                    message='Submission halted for merging conflicts.',
                ),
            ),
            mark('approved', True),
            mark('merged', True),
        ],
        IF_ELSE(
            is_marked('auto-approved'),
            mark('approved', True),
            [
                IF_NOT(
                    is_record_relevant,
                    [
                        reject_record('Article automatically rejected'),
                        mark('approved', False),
                        save_workflow,
                        stop_processing,
                    ],
                ),
                halt_record(
                    action="hep_approval",
                    message="Submission halted for curator approval.",
                ),
            ],
        ),
    ),
]
# Storage stage: ``store_record`` followed by ``store_root``.
STORE_RECORD = [store_record, store_root]
# Record Holding Pen matches as marks.
# ``raise_if_match_wf_in_error_or_initial`` runs first; then the
# 'already-in-holding-pen' and 'previously_rejected' marks are each set to
# True or False, and the workflow is saved whenever one is set to True.
# NOTE: 'previously_rejected' is spelled with an underscore, unlike the
# hyphenated marks; it is read elsewhere under exactly this spelling.
MARK_IF_MATCH_IN_HOLDINGPEN = [
    raise_if_match_wf_in_error_or_initial,
    IF_ELSE(
        match_non_completed_wf_in_holdingpen,
        [
            mark('already-in-holding-pen', True),
            save_workflow,
        ],
        mark('already-in-holding-pen', False),
    ),
    IF_ELSE(
        match_previously_rejected_wf_in_holdingpen,
        [
            mark('previously_rejected', True),
            save_workflow,
        ],
        mark('previously_rejected', False),
    ),
]
# Flag the workflow as having taken an unexpected path: set the mark, run
# ``error_workflow`` with the message below, then save.
ERROR_WITH_UNEXPECTED_WORKFLOW_PATH = [
    mark('unexpected-workflow-path', True),
    error_workflow('Unexpected workflow path.'),
    save_workflow,
]
# Holding Pen match handling for harvested (non-submission) records.
# NOTE: every harvest is currently handled as if it came from arXiv; this
# will have to change.
PROCESS_HOLDINGPEN_MATCH_HARVEST = [
    # Auto-reject: the record is not an update, was previously rejected, is
    # not auto-approved, and comes from the same source as the rejected match.
    IF_NOT(
        is_marked('is-update'),
        IF(
            is_marked('previously_rejected'),
            IF_NOT(
                is_marked('auto-approved'),
                IF(
                    has_same_source('previously_rejected_matches'),
                    [
                        mark('approved', False),
                        save_workflow,
                        stop_processing,
                    ],
                ),
            ),
        ),
    ),
    IF_ELSE(
        is_marked('already-in-holding-pen'),
        IF_ELSE(
            has_same_source('holdingpen_matches'),
            # Same source: stop the matched workflow and continue this one.
            [
                stop_matched_holdingpen_wfs,
                mark('stopped-matched-holdingpen-wf', True),
            ],
            # Different source: this is an update from another source, so
            # keep the old workflow and stop this one.
            [
                mark('stopped-matched-holdingpen-wf', False),
                save_workflow,
                stop_processing,
            ],
        ),
        mark('stopped-matched-holdingpen-wf', False),
    ),
    save_workflow,
]
# Holding Pen match handling for submissions; only acts when the record is
# marked 'already-in-holding-pen'.
PROCESS_HOLDINGPEN_MATCH_SUBMISSION = [
    IF(
        is_marked('already-in-holding-pen'),
        IF_ELSE(
            has_same_source('holdingpen_matches'),
            # Same source: the submission form should have detected this
            # double submission, so treat it as an unexpected path.
            ERROR_WITH_UNEXPECTED_WORKFLOW_PATH,
            # Different source: stop the matched workflow and continue
            # with this one.
            [
                stop_matched_holdingpen_wfs,
                mark('stopped-matched-holdingpen-wf', True),
                save_workflow,
            ],
        ),
    ),
]
# Dispatch to the submission or the harvest variant of the Holding Pen
# match handling.
PROCESS_HOLDINGPEN_MATCHES = [
    IF_ELSE(
        is_submission,
        PROCESS_HOLDINGPEN_MATCH_SUBMISSION,
        PROCESS_HOLDINGPEN_MATCH_HARVEST,
    ),
]
# Set the 'is-update' mark by matching the record, then save the workflow.
#
# * Exact match: mark 'is-update' and 'exact-matched'; halt with
#   ``resolve_multiple_exact_matches`` if there is more than one match.
# * Fuzzy match: halt with ``match_approval``; afterwards mark 'is-update'
#   True (and 'fuzzy-matched') only if the match was approved.
# * No match: mark 'is-update' False.
CHECK_IS_UPDATE = [
    IF_ELSE(
        exact_match,
        [
            set_exact_match_as_approved_in_extradata,
            mark('is-update', True),
            mark('exact-matched', True),
            IF(
                has_more_than_one_exact_match,
                halt_record(
                    action="resolve_multiple_exact_matches",
                    message=(
                        "Workflow halted for resolving multiple "
                        "exact matches."
                    ),
                ),
            ),
        ],
        IF_ELSE(
            fuzzy_match,
            [
                halt_record(
                    action="match_approval",
                    message="Halted for matching approval.",
                ),
                IF_ELSE(
                    is_fuzzy_match_approved,
                    [
                        set_fuzzy_match_approved_in_extradata,
                        mark('fuzzy-matched', True),
                        mark('is-update', True),
                    ],
                    mark('is-update', False),
                ),
            ],
            mark('is-update', False),
        ),
    ),
    save_workflow,
]
# Run the NOTIFY_SUBMISSION ticket tasks for submissions only.
NOTIFY_IF_SUBMISSION = [
    IF(
        is_submission,
        NOTIFY_SUBMISSION,
    ),
]
# Reset the marks listed below to ``None`` and save the workflow.
# NOTE(review): 'merged', 'exact-matched' and 'fuzzy-matched' are set by
# other task lists in this module but are not reset here -- confirm that
# this is intended.
INIT_MARKS = [
    mark('auto-approved', None),
    mark('already-in-holding-pen', None),
    mark('previously_rejected', None),
    mark('is-update', None),
    mark('stopped-matched-holdingpen-wf', None),
    mark('approved', None),
    mark('unexpected-workflow-path', None),
    save_workflow,
]
# First stage of the workflow: load the source data, set the schema, reset
# the marks and validate the record against the 'hep' schema.
# INIT_MARKS is included as a nested list, not concatenated.
PRE_PROCESSING = [
    load_from_source_data,
    # The schema must be set for proper indexing in the Holding Pen.
    set_schema,
    INIT_MARKS,
    validate_record('hep'),
]
class Article(object):
    """Article ingestion workflow for Literature collection.

    The ``workflow`` attribute is the concatenation of the task lists
    defined at module level, executed in this order:

    1. pre-processing and, for submissions, ticket notification;
    2. Holding Pen matching and update detection;
    3. auto-approval check and handling of Holding Pen matches;
    4. record enhancement and the approval/halt stage;
    5. for accepted records: post-enhancement, storage, sending to legacy
       and notifications; otherwise the "not accepted" notification;
    6. for submissions: closing the ticket stored under ``ticket_id``.
    """

    # Workflow name and data type (how the workflow engine consumes these is
    # not visible in this module).
    name = "HEP"
    data_type = "hep"

    workflow = (
        PRE_PROCESSING +
        NOTIFY_IF_SUBMISSION +
        MARK_IF_MATCH_IN_HOLDINGPEN +
        CHECK_IS_UPDATE +
        STOP_IF_EXISTING_SUBMISSION +
        CHECK_AUTO_APPROVE +
        PROCESS_HOLDINGPEN_MATCHES +
        ENHANCE_RECORD +
        HALT_FOR_APPROVAL_IF_NEW_OR_STOP_IF_NOT_RELEVANT +
        [
            IF_ELSE(
                is_record_accepted,
                # Accepted: finalise, store, export and notify.
                (
                    POSTENHANCE_RECORD +
                    STORE_RECORD +
                    SEND_TO_LEGACY +
                    WAIT_FOR_LEGACY_WEBCOLL +
                    NOTIFY_ACCEPTED +
                    NOTIFY_CURATOR_IF_NEEDED
                ),
                # Not accepted: only notify.
                NOTIFY_NOT_ACCEPTED,
            ),
            # The submission ticket is closed on both branches.
            IF(
                is_submission,
                do_not_repeat('close_ticket_user_submission')(
                    close_ticket(ticket_id_key="ticket_id"),
                ),
            ),
        ]
    )