From 7bc87f8261b25a43e6ce617197becb594f5e8ebb Mon Sep 17 00:00:00 2001
From: David Read <david.read@hackneyworkshop.com>
Date: Fri, 28 Feb 2020 10:30:53 +0000
Subject: [PATCH] Add warning about clearsource_history command and duplicates.
 See also
 https://github.com/ckan/ckanext-harvest/pull/268#issuecomment-592451916

---
 README.rst                             | 22 ++++++++++++----------
 ckanext/harvest/cli.py                 | 15 +++++++++------
 ckanext/harvest/commands/harvester.py  | 12 ++++++++----
 ckanext/harvest/logic/action/update.py | 18 ++++++++++++------
 4 files changed, 41 insertions(+), 26 deletions(-)

diff --git a/README.rst b/README.rst
index 59c6883f1..a20b8ace2 100644
--- a/README.rst
+++ b/README.rst
@@ -219,12 +219,14 @@ The following operations can be run from the command line as described underneat
           but keeps the source itself
 
       harvester clearsource_history [{source-id}]
-        - If no source id is given the history for all harvest sources (maximum is 1000)
-          will be cleared.
-          Clears all jobs and objects related to a harvest source, but keeps the source
-          itself. The datasets imported from the harvest source will **NOT** be deleted!!!
-          If a source id is given, it only clears the history of the harvest source with
-          the given source id.
+        - clears the history for all active harvest sources (up to a maximum of
+          1000 sources), or, if a source id is given, for just that harvest
+          source. All jobs and objects related to the harvest source(s) will
+          be cleared, but the source itself is kept. This is useful for cleaning
+          the history of long-running harvest sources to start again fresh.
+          Warning: The datasets imported from the harvest source will NOT be deleted.
+          They will be disassociated from the harvest source, so if you harvest again
+          duplicate datasets will be created.
 
       harvester sources [all]
         - lists harvest sources
@@ -253,9 +255,9 @@ The following operations can be run from the command line as described underneat
           import) without involving the web UI or the queue backends. This is
           useful for testing a harvester without having to fire up
           gather/fetch_consumer processes, as is done in production.
-          
+
       harvester run_test {source-id/name} force-import=guid1,guid2...
-        - In order to force an import of particular datasets, useful to 
+        - In order to force an import of particular datasets, useful to
           target a dataset for dev purposes or when forcing imports on other environments.
 
       harvester gather_consumer
@@ -654,10 +656,10 @@ harvester run_test
 You can run a harvester simply using the ``run_test`` command. This is handy
 for running a harvest with one command in the console and see all the output
 in-line. It runs the gather, fetch and import stages all in the same process.
-You must ensure that you have pip installed ``dev-requirements.txt`` 
+You must ensure that you have pip installed ``dev-requirements.txt``
 in ``/home/ckan/ckan/lib/default/src/ckanext-harvest`` before using the
 ``run_test`` command.
-  
+
 This is useful for developing a harvester because you can insert break-points
 in your harvester, and rerun a harvest without having to restart the
 gather_consumer and fetch_consumer processes each time. In addition, because it
diff --git a/ckanext/harvest/cli.py b/ckanext/harvest/cli.py
index 8aa3e3c3e..43d10ceb2 100644
--- a/ckanext/harvest/cli.py
+++ b/ckanext/harvest/cli.py
@@ -110,14 +110,17 @@ def clear(ctx, id):
 @click.argument(u"id", metavar=u"SOURCE_ID_OR_NAME", required=False)
 @click.pass_context
 def clear_history(ctx, id):
-    """If no source id is given the history for all harvest sources
-    (maximum is 1000) will be cleared.
+    """
+    Clears all jobs and objects related to a harvest source, but keeps the
+    source itself. This is useful for cleaning the history of long-running
+    harvest sources to start again fresh.
 
-    Clears all jobs and objects related to a harvest source, but keeps
-    the source itself.  The datasets imported from the harvest source
-    will NOT be deleted!!!  If a source id is given, it only clears
-    the history of the harvest source with the given source id.
+    Warning: The datasets imported from the harvest source will NOT be deleted.
+    They will be disassociated from the harvest source, so if you harvest again
+    duplicate datasets will be created.
 
+    If no source id is given, the history for all active harvest sources (up
+    to a maximum of 1000 sources) will be cleared.
     """
     flask_app = ctx.meta["flask_app"]
 
diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py
index 5e373f550..e11ab215c 100644
--- a/ckanext/harvest/commands/harvester.py
+++ b/ckanext/harvest/commands/harvester.py
@@ -33,10 +33,14 @@ class Harvester(CkanCommand):
           but keeps the source itself
 
       harvester clearsource_history [{source-id}]
-        - If no source id is given the history for all harvest sources (maximum is 1000) will be cleared.
-          Clears all jobs and objects related to a harvest source, but keeps the source itself.
-          The datasets imported from the harvest source will NOT be deleted!!!
-          If a source id is given, it only clears the history of the harvest source with the given source id.
+        - clears the history for all active harvest sources (up to a maximum of
+          1000 sources), or, if a source id is given, for just that harvest
+          source. All jobs and objects related to the harvest source(s) will
+          be cleared, but the source itself is kept. This is useful for cleaning
+          the history of long-running harvest sources to start again fresh.
+          Warning: The datasets imported from the harvest source will NOT be deleted.
+          They will be disassociated from the harvest source, so if you harvest again
+          duplicate datasets will be created.
 
       harvester sources [all]
         - lists harvest sources
diff --git a/ckanext/harvest/logic/action/update.py b/ckanext/harvest/logic/action/update.py
index 62fb0a67e..c78fdec80 100644
--- a/ckanext/harvest/logic/action/update.py
+++ b/ckanext/harvest/logic/action/update.py
@@ -236,11 +236,14 @@ def harvest_source_clear(context, data_dict):
 
 def harvest_sources_job_history_clear(context, data_dict):
     '''
-    Clears the history for all active harvest sources. All jobs and objects related to a harvest source will
-    be cleared, but keeps the source itself.
-    This is useful to clean history of long running harvest sources to start again fresh.
-    The datasets imported from the harvest source will NOT be deleted!!!
-
+    Clears the history for all active harvest sources (up to a maximum of
+    1000 sources). All jobs and objects related to a harvest source will be
+    cleared, but the source itself is kept. This is useful for cleaning the
+    history of long-running harvest sources to start again fresh.
+
+    Warning: The datasets imported from the harvest source will NOT be deleted.
+    They will be disassociated from the harvest source, so if you harvest again
+    duplicate datasets will be created.
     '''
     check_access('harvest_sources_clear', context, data_dict)
 
@@ -265,7 +268,10 @@ def harvest_source_job_history_clear(context, data_dict):
     '''
     Clears all jobs and objects related to a harvest source, but keeps the source itself.
     This is useful to clean history of long running harvest sources to start again fresh.
-    The datasets imported from the harvest source will NOT be deleted!!!
+
+    Warning: The datasets imported from the harvest source will NOT be deleted.
+    They will be disassociated from the harvest source, so if you harvest again
+    duplicate datasets will be created.
 
     :param id: the id of the harvest source to clear
     :type id: string