Source code for inspirehep.modules.migrator.cli

# -*- coding: utf-8 -*-
#
# This file is part of INSPIRE.
# Copyright (C) 2014-2017 CERN.
#
# INSPIRE is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# INSPIRE is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with INSPIRE. If not, see <http://www.gnu.org/licenses/>.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

"""Manage migrator from INSPIRE legacy instance."""

from __future__ import absolute_import, division, print_function

import csv
import os
import sys
import traceback
import warnings

from itertools import dropwhile
from textwrap import dedent

import click
import jsonschema

from dojson.contrib.marc21.utils import create_record

from flask import current_app
from flask.cli import with_appcontext

from inspire_dojson import marcxml2record
from inspire_schemas.api import validate

from inspirehep.utils.schema import ensure_valid_schema
from .models import LegacyRecordsMirror
from .tasks import (
    migrate_from_file,
    migrate_from_mirror,
    migrate_record_from_legacy,
    populate_mirror_from_file,
)
from .utils import get_collection


def halt_if_debug_mode(force):
    """Abort the running command when the application is in debug mode.

    When ``force`` is false and the ``DEBUG`` key of the current Flask
    application's config is truthy, an explanatory message is written to
    stderr and the process exits with status 1. Otherwise the function
    returns without doing anything.

    Args:
        force (bool): when true, skip the check entirely.

    Raises:
        SystemExit: with code 1 when the check fails.
    """
    # The trailing backslash suppresses the leading newline so that
    # ``dedent`` yields a message starting on its first line.
    message = '''\
    The application is running in debug mode, which leaks memory when doing
    many database operations. To avoid problems, disable debug mode. This can
    be done by setting "DEBUG=False" in the config or setting the environment
    variable "APP_DEBUG=False".

    If you know what you are doing, you can pass the "--force" flag to disable
    this check.
    '''
    # ``force`` is tested first, so the app config is not read when forcing.
    if not force and current_app.config.get('DEBUG'):
        click.echo(dedent(message), err=True)
        sys.exit(1)


@click.group()
def migrate():
    """Commands to migrate records from legacy."""


# NOTE: click uses the docstrings of the commands below as their ``--help``
# text, so implementation notes are kept in comments rather than docstrings.
@migrate.command('file')
@click.argument('file_name', type=click.Path(exists=True, dir_okay=False, resolve_path=True))
@click.option('--mirror-only', '-m', is_flag=True, default=False,
              help='Only mirror the records instead of doing a full migration.')
@click.option('--wait', '-w', is_flag=True, default=False,
              help='Wait for migration to complete. This only has an effect if the -m flag is not set.')
@click.option('-f', '--force', is_flag=True, default=False,
              help='Force the task to run even in debug mode.')
@with_appcontext
def migrate_file(file_name, mirror_only=False, wait=False, force=False):
    """Migrate the records in the provided file.

    The file can be an (optionally-gzipped) XML file containing MARCXML, or a
    prodsync tarball.
    """
    halt_if_debug_mode(force=force)
    click.echo("Migrating records from file: {0}".format(file_name))

    # The mirror is always populated; the migration itself is skipped when
    # only mirroring was requested, in which case ``wait`` is unused.
    populate_mirror_from_file(file_name)
    if not mirror_only:
        migrate_from_mirror(wait_for_results=wait)


@migrate.command()
@click.option('--all', '-a', 'also_migrate', flag_value='all',
              help='Migrate all records, irrespective of their status.')
@click.option('--broken', '-b', 'also_migrate', flag_value='broken',
              help='Also migrate broken records, which did not migrate correctly in the previous run.')
@click.option('--wait', '-w', is_flag=True, default=False,
              help='Wait for migration to complete.')
@click.option('-f', '--force', is_flag=True, default=False,
              help='Force the task to run even in debug mode.')
@with_appcontext
def mirror(also_migrate=None, wait=False, force=False):
    """Migrate records from the mirror.

    By default, only records that have not been migrated yet are migrated.
    """
    # ``also_migrate`` is None, 'all' or 'broken' depending on which of the
    # two mutually overriding flags was given last on the command line.
    halt_if_debug_mode(force=force)
    migrate_from_mirror(also_migrate=also_migrate, wait_for_results=wait)


@migrate.command()
@click.argument('recid', type=int)
@with_appcontext
def record(recid):
    """Migrate a single record from legacy."""
    click.echo('Migrating record {recid} from INSPIRE legacy'.format(recid=recid))
    migrate_record_from_legacy(recid)


@click.group()
def migrator():
    """DEPRECATED Command related to migrating INSPIRE data."""
    warnings.warn('The "migrator" command is deprecated, use "migrate" instead.', DeprecationWarning)


@migrator.command()
@click.option('--file-input', '-f',
              help='File to migrate records from.')
@click.option('--remigrate-broken', '-b', is_flag=True, default=False,
              help='Try to remigrate broken records')
@click.option('--remigrate-all', '-a', is_flag=True, default=False,
              help='Remigrate all records')
@click.option('--wait', '-w', type=bool, default=False,
              help='Wait for migrator to complete.')
def populate(file_input=None, remigrate_broken=False, remigrate_all=False, wait=False):
    """DEPRECATED Populates the system with records from migrator files.

    Usage: inveniomanage migrator populate -f prodsync20151117173222.xml.gz
    """
    warnings.warn('The "migrator" command is deprecated, use "migrate" instead.', DeprecationWarning)

    # The options are checked in priority order: remigrating takes precedence
    # over a file argument, and nothing happens when no option is given.
    #
    # NOTE(review): this used to call
    # ``migrate_from_mirror.delay(only_broken=...)``, which does not match the
    # way the same callable is invoked by the ``migrate`` commands in this
    # module (direct call, ``also_migrate``/``wait_for_results`` keywords).
    # Aligned with those call sites; confirm against ``tasks.migrate_from_mirror``.
    if remigrate_broken:
        click.echo("Remigrate broken records...")
        migrate_from_mirror(also_migrate='broken', wait_for_results=wait)
    elif remigrate_all:
        click.echo("Remigrate all records...")
        migrate_from_mirror(also_migrate='all', wait_for_results=wait)
    elif file_input and not os.path.isfile(file_input):
        click.echo("{0} is not a file!".format(file_input), err=True)
    elif file_input:
        click.echo("Migrating records from file: {0}".format(file_input))
        migrate_from_file(os.path.abspath(file_input), wait_for_results=wait)


@migrator.command()
@click.option('--recid', '-r', type=int, help="recid on INSPIRE")
def one(recid):
    """DEPRECATED Migrate one record from legacy."""
    warnings.warn('The "migrator" command is deprecated, use "migrate" instead.', DeprecationWarning)
    click.echo('Migrating record {recid} from INSPIRE legacy'.format(recid=recid))
    migrate_record_from_legacy(recid)


def _split_validation_error(err):
    """Return ``(summary, details)`` extracted from the text of ``err``.

    ``summary`` is the first line starting with ``Failed validating``; when no
    such line exists, the first line of the message is used instead (or an
    empty string for an empty message). ``details`` is everything from the
    first line starting with ``On instance`` onwards, or an empty string when
    there is no such line.
    """
    rows = str(err).splitlines()
    fallback = rows[0] if rows else u''
    summary = next(
        (row for row in rows if row.startswith('Failed validating')),
        fallback,
    )
    details = u'\n'.join(
        dropwhile(lambda row: not row.startswith('On instance'), rows)
    )
    return summary, details


@migrator.command()
@click.option('--output', '-o', default="/tmp/broken-records.csv",
              help='Specify where to report errors.')
@with_appcontext
def reporterrors(output):
    """Reports in a friendly way all failed records and corresponding motivation."""
    click.echo("Reporting broken records into {0}".format(output))

    # Maps (collection, stage, error text) to the affected records:
    #   * stage 'dojson':     list of recids
    #   * stage 'validation': list of (recid, details) pairs
    errors = {}

    results = LegacyRecordsMirror.query.filter(LegacyRecordsMirror.valid == False)  # noqa: ignore=F712
    results_length = results.count()

    with click.progressbar(results.yield_per(100), length=results_length) as bar:
        for obj in bar:
            marc_record = create_record(obj.marcxml, keep_singletons=False)
            collection = get_collection(marc_record)

            # Deleted records are not reported.
            if 'DELETED' in collection:
                continue

            recid = int(marc_record['001'])

            try:
                json_record = marcxml2record(obj.marcxml)
            except Exception:
                # Any conversion failure is reported, grouped by traceback.
                tb = u''.join(traceback.format_tb(sys.exc_info()[2]))
                errors.setdefault((collection, 'dojson', tb), []).append(recid)
                continue

            ensure_valid_schema(json_record)

            try:
                validate(json_record)
            except jsonschema.exceptions.ValidationError as err:
                exc, details = _split_validation_error(err)
                errors.setdefault(
                    (collection, 'validation', exc), []
                ).append((recid, details))

    with open(output, "w") as out:
        csv_writer = csv.writer(out)
        # ``items()`` rather than ``iteritems()`` so this runs on Python 3 too.
        for (collection, stage, error), elements in errors.items():
            if stage == 'dojson':
                # One row per distinct traceback, listing every affected record.
                csv_writer.writerow((
                    collection,
                    stage,
                    error,
                    '\n'.join(
                        'http://inspirehep.net/record/{}'.format(recid)
                        for recid in elements
                    )
                ))
            else:
                # One row per record, since the details differ between records.
                for recid, details in elements:
                    csv_writer.writerow((
                        collection,
                        stage,
                        error,
                        'http://inspirehep.net/record/{}'.format(recid),
                        details
                    ))

    click.echo("Dumped errors into {}".format(output))