# Source code for inspirehep.modules.refextract.tasks

# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

"""Refextract tasks."""

from __future__ import absolute_import, division, print_function

from celery import shared_task
from flask import current_app
from invenio_db import db

from inspirehep.modules.refextract.utils import KbWriter


@shared_task()
def create_journal_kb_file():
    """Write refextract's journal knowledge base from the journal records.

    Runs two raw queries, written in PostgreSQL-specific syntax, against
    ``records_metadata`` (restricted to records whose ``_collections``
    contains ``Journals``) and writes the results through ``KbWriter`` to
    the path configured as ``REFEXTRACT_JOURNAL_KB_PATH``. The file is meant
    to be in the format that refextract expects, a list of lines like::

        SOURCE---DESTINATION

    meaning that ``SOURCE`` is translated to ``DESTINATION`` when found.

    For every journal, the short title is passed as the ``kb_key`` of one
    entry per short title, journal title and title variant.

    Note that refextract expects ``SOURCE`` to be normalized: all non
    alphanumeric characters removed, contiguous whitespace collapsed to a
    single space and the resulting string uppercased.
    """
    kb_path = current_app.config['REFEXTRACT_JOURNAL_KB_PATH']

    # One row per journal record: its short title and its full title.
    titles_sql = (
        " SELECT r.json -> 'short_title' AS short_title,"
        " r.json -> 'journal_title' -> 'title' AS journal_title"
        " FROM records_metadata AS r"
        " WHERE (r.json -> '_collections')::jsonb ? 'Journals' "
    )
    # One row per (journal record, title variant) pair: the variants array
    # is expanded by ``jsonb_array_elements``.
    variants_sql = (
        " SELECT r.json -> 'short_title' AS short_title,"
        " jsonb_array_elements((r.json -> 'title_variants')::jsonb)"
        " AS title_variant"
        " FROM records_metadata AS r"
        " WHERE (r.json -> '_collections')::jsonb ? 'Journals' "
    )

    # Both queries are issued before the KB file is opened.
    title_rows = db.session.execute(titles_sql)
    variant_rows = db.session.execute(variants_sql)

    with KbWriter(kb_path=kb_path) as writer:
        for title_row in title_rows:
            short_title = title_row['short_title']
            # The short title is also registered as an entry for itself.
            writer.add_entry(value=short_title, kb_key=short_title)
            writer.add_entry(
                value=title_row['journal_title'],
                kb_key=short_title,
            )

        for variant_row in variant_rows:
            writer.add_entry(
                value=variant_row['title_variant'],
                kb_key=variant_row['short_title'],
            )