From 7bc87f8261b25a43e6ce617197becb594f5e8ebb Mon Sep 17 00:00:00 2001 From: David Read Date: Fri, 28 Feb 2020 10:30:53 +0000 Subject: [PATCH] Add warning about clearsource_history command and duplicates. See also https://github.com/ckan/ckanext-harvest/pull/268#issuecomment-592451916 --- README.rst | 22 ++++++++++++---------- ckanext/harvest/cli.py | 15 +++++++++------ ckanext/harvest/commands/harvester.py | 12 ++++++++---- ckanext/harvest/logic/action/update.py | 18 ++++++++++++------ 4 files changed, 41 insertions(+), 26 deletions(-) diff --git a/README.rst b/README.rst index 59c6883f1..a20b8ace2 100644 --- a/README.rst +++ b/README.rst @@ -219,12 +219,14 @@ The following operations can be run from the command line as described underneat but keeps the source itself harvester clearsource_history [{source-id}] - - If no source id is given the history for all harvest sources (maximum is 1000) - will be cleared. - Clears all jobs and objects related to a harvest source, but keeps the source - itself. The datasets imported from the harvest source will **NOT** be deleted!!! - If a source id is given, it only clears the history of the harvest source with - the given source id. + - clears the history for all active harvest sources (up to a maximum of + 1000 sources), or specify a specific harvest source to clear just + that one. All jobs and objects related to the harvest source(s) will + be cleared, but it keeps the source itself. This is useful to clean + history of long running harvest sources to start again fresh. + Warning: The datasets imported from the harvest source will NOT be deleted. + They will be disassociated from the harvest source, so if you harvest again + it'll create duplicate datasets. harvester sources [all] - lists harvest sources @@ -253,9 +255,9 @@ The following operations can be run from the command line as described underneat import) without involving the web UI or the queue backends. 
This is useful for testing a harvester without having to fire up gather/fetch_consumer processes, as is done in production. - + harvester run_test {source-id/name} force-import=guid1,guid2... - - In order to force an import of particular datasets, useful to + - In order to force an import of particular datasets, useful to target a dataset for dev purposes or when forcing imports on other environments. harvester gather_consumer @@ -654,10 +656,10 @@ harvester run_test You can run a harvester simply using the ``run_test`` command. This is handy for running a harvest with one command in the console and see all the output in-line. It runs the gather, fetch and import stages all in the same process. -You must ensure that you have pip installed ``dev-requirements.txt`` +You must ensure that you have pip installed ``dev-requirements.txt`` in ``/home/ckan/ckan/lib/default/src/ckanext-harvest`` before using the ``run_test`` command. - + This is useful for developing a harvester because you can insert break-points in your harvester, and rerun a harvest without having to restart the gather_consumer and fetch_consumer processes each time. In addition, because it diff --git a/ckanext/harvest/cli.py b/ckanext/harvest/cli.py index 8aa3e3c3e..43d10ceb2 100644 --- a/ckanext/harvest/cli.py +++ b/ckanext/harvest/cli.py @@ -110,14 +110,17 @@ def clear(ctx, id): @click.argument(u"id", metavar=u"SOURCE_ID_OR_NAME", required=False) @click.pass_context def clear_history(ctx, id): - """If no source id is given the history for all harvest sources - (maximum is 1000) will be cleared. + """ + Clears all jobs and objects related to a harvest source, but keeps the + source itself. This is useful to clean history of long running harvest + sources to start again fresh. - Clears all jobs and objects related to a harvest source, but keeps - the source itself. The datasets imported from the harvest source - will NOT be deleted!!! 
If a source id is given, it only clears - the history of the harvest source with the given source id. + Warning: The datasets imported from the harvest source will NOT be deleted. + They will be disassociated from the harvest source, so if you harvest again + it'll create duplicate datasets. + If no source id is given the history for all harvest sources (up to a + maximum of 1000 sources) will be cleared. """ flask_app = ctx.meta["flask_app"] diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py index 5e373f550..e11ab215c 100644 --- a/ckanext/harvest/commands/harvester.py +++ b/ckanext/harvest/commands/harvester.py @@ -33,10 +33,14 @@ class Harvester(CkanCommand): but keeps the source itself harvester clearsource_history [{source-id}] - - If no source id is given the history for all harvest sources (maximum is 1000) will be cleared. - Clears all jobs and objects related to a harvest source, but keeps the source itself. - The datasets imported from the harvest source will NOT be deleted!!! - If a source id is given, it only clears the history of the harvest source with the given source id. + - clears the history for all active harvest sources (up to a maximum of + 1000 sources), or specify a specific harvest source to clear just + that one. All jobs and objects related to the harvest source(s) will + be cleared, but it keeps the source itself. This is useful to clean + history of long running harvest sources to start again fresh. + Warning: The datasets imported from the harvest source will NOT be deleted. + They will be disassociated from the harvest source, so if you harvest again + it'll create duplicate datasets. 
harvester sources [all] - lists harvest sources diff --git a/ckanext/harvest/logic/action/update.py b/ckanext/harvest/logic/action/update.py index 62fb0a67e..c78fdec80 100644 --- a/ckanext/harvest/logic/action/update.py +++ b/ckanext/harvest/logic/action/update.py @@ -236,11 +236,14 @@ def harvest_source_clear(context, data_dict): def harvest_sources_job_history_clear(context, data_dict): ''' - Clears the history for all active harvest sources. All jobs and objects related to a harvest source will - be cleared, but keeps the source itself. - This is useful to clean history of long running harvest sources to start again fresh. - The datasets imported from the harvest source will NOT be deleted!!! - + Clears the history for all active harvest sources (up to a maximum of + 1000 sources). All jobs and objects related to a harvest source will be + cleared, but it keeps the source itself. This is useful to clean history of + long running harvest sources to start again fresh. + + Warning: The datasets imported from the harvest source will NOT be deleted. + They will be disassociated from the harvest source, so if you harvest again + it'll create duplicate datasets. ''' check_access('harvest_sources_clear', context, data_dict) @@ -265,7 +268,10 @@ def harvest_source_job_history_clear(context, data_dict): ''' Clears all jobs and objects related to a harvest source, but keeps the source itself. This is useful to clean history of long running harvest sources to start again fresh. - The datasets imported from the harvest source will NOT be deleted!!! + + Warning: The datasets imported from the harvest source will NOT be deleted. + They will be disassociated from the harvest source, so if you harvest again + it'll create duplicate datasets. :param id: the id of the harvest source to clear :type id: string