fix(scan): fix clocked and scheduled scan not working (#182)
* fix(scan): fix clocked and scheduled scan not working

* fix(scan): store start datetime of schedule scan in UTC

* fix(celery): add more DEBUG logging for celery beat (datetime of tasks ...)

* style(time): add UTC to the schedule time
psyray authored Sep 4, 2024
1 parent 7ff6d01 commit 4d35400
Showing 6 changed files with 222 additions and 161 deletions.
28 changes: 28 additions & 0 deletions web/reNgine/common_func.py
@@ -1139,3 +1139,31 @@ def extract_columns(row, columns):
        list: Extracted values from the specified columns.
    """
    return [row[i] for i in columns]

def create_scan_object(host_id, engine_id, initiated_by_id=None):
    '''
    create task with pending status so that celery task will execute when
    threads are free
    Args:
        host_id: int: id of Domain model
        engine_id: int: id of EngineType model
        initiated_by_id: int: id of User model (Optional)
    '''
    # get current time
    current_scan_time = timezone.now()
    # fetch engine and domain object
    engine = EngineType.objects.get(pk=engine_id)
    domain = Domain.objects.get(pk=host_id)
    scan = ScanHistory()
    scan.scan_status = INITIATED_TASK
    scan.domain = domain
    scan.scan_type = engine
    scan.start_scan_date = current_scan_time
    if initiated_by_id:
        user = User.objects.get(pk=initiated_by_id)
        scan.initiated_by = user
    scan.save()
    # save last scan date for domain model
    domain.start_scan_date = current_scan_time
    domain.save()
    return scan.id
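
The helper above is called from initiate_scan for scheduled runs (see the tasks.py diff below). A minimal usage sketch, assuming the Domain, EngineType and User records already exist and that ScanHistory lives in startScan.models as in upstream reNgine; the concrete variables (domain, engine, request) are illustrative only:

# Hedged usage sketch -- not part of this commit; mirrors the call site in tasks.py.
from reNgine.common_func import create_scan_object
from startScan.models import ScanHistory

scan_history_id = create_scan_object(
    host_id=domain.id,                 # Domain to scan (illustrative object)
    engine_id=engine.id,               # EngineType selected for the scan
    initiated_by_id=request.user.id,   # optional: user who scheduled the scan
)
scan = ScanHistory.objects.get(pk=scan_history_id)   # scan_status == INITIATED_TASK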
16 changes: 12 additions & 4 deletions web/reNgine/settings.py
@@ -171,10 +171,6 @@
USE_L10N = True
USE_TZ = True

# Temporary fix for celery beat crash
# See https://github.com/yogeshojha/rengine/issues/971
DJANGO_CELERY_BEAT_TZ_AWARE = False

MEDIA_URL = '/media/'
FILE_UPLOAD_MAX_MEMORY_SIZE = 100000000
FILE_UPLOAD_PERMISSIONS = 0o644
@@ -264,6 +260,13 @@
            'filename': 'celery.log',
            'maxBytes': 1024 * 1024 * 100,  # 100 mb
        },
        'celery_beat': {
            'class': 'logging.handlers.RotatingFileHandler',
            'formatter': 'simple',
            'filename': 'celery_beat.log',
            'maxBytes': 1024 * 1024 * 100,  # 100 mb
            'backupCount': 5,
        },
    },
    'formatters': {
        'default': {
@@ -328,6 +331,11 @@
            'handlers': ['null'],
            'propagate': False,
        },
        'django_celery_beat': {
            'handlers': ['celery_beat', 'console'],
            'level': 'DEBUG',
            'propagate': True,
        },
    },
    'root': {
        'handlers': ['console'],
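
With the celery_beat handler and the django_celery_beat logger above, any record emitted under the django_celery_beat namespace at DEBUG or higher is written to celery_beat.log (rotated at 100 MB, 5 backups) and echoed to the console. A small sketch showing how that routing can be exercised; the logger name comes from the config above, the message is illustrative:

import logging

# Children of 'django_celery_beat' (e.g. its schedulers module) inherit the
# DEBUG level and the celery_beat/console handlers configured above.
logger = logging.getLogger('django_celery_beat.schedulers')
logger.debug('next wakeup for clocked task computed at %s (UTC)', '2024-09-04 12:00:00')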
226 changes: 121 additions & 105 deletions web/reNgine/tasks.py
@@ -63,6 +63,7 @@ def initiate_scan(
results_dir=RENGINE_RESULTS,
imported_subdomains=[],
out_of_scope_subdomains=[],
initiated_by_id=None,
url_filter=''):
"""Initiate a new scan.
@@ -74,134 +75,149 @@
results_dir (str): Results directory.
imported_subdomains (list): Imported subdomains.
out_of_scope_subdomains (list): Out-of-scope subdomains.
url_filter (str): URL path. Default: ''
url_filter (str): URL path. Default: ''.
initiated_by_id (int): ID of the user initiating the scan.
"""

if CELERY_REMOTE_DEBUG:
debug()

# Get scan history
scan = ScanHistory.objects.get(pk=scan_history_id)
logger.info('Initiating scan on celery')
scan = None
try:
# Get scan engine
engine_id = engine_id or scan.scan_type.id # scan history engine_id
engine = EngineType.objects.get(pk=engine_id)

# Get scan engine
engine_id = engine_id or scan.scan_type.id # scan history engine_id
engine = EngineType.objects.get(pk=engine_id)
# Get YAML config
config = yaml.safe_load(engine.yaml_configuration)
enable_http_crawl = config.get(ENABLE_HTTP_CRAWL, DEFAULT_ENABLE_HTTP_CRAWL)
gf_patterns = config.get(GF_PATTERNS, [])

# Get YAML config
config = yaml.safe_load(engine.yaml_configuration)
enable_http_crawl = config.get(ENABLE_HTTP_CRAWL, DEFAULT_ENABLE_HTTP_CRAWL)
gf_patterns = config.get(GF_PATTERNS, [])
# Get domain and set last_scan_date
domain = Domain.objects.get(pk=domain_id)
domain.last_scan_date = timezone.now()
domain.save()

# Get domain and set last_scan_date
domain = Domain.objects.get(pk=domain_id)
domain.last_scan_date = timezone.now()
domain.save()
# Get path filter
url_filter = url_filter.rstrip('/')

# Get path filter
url_filter = url_filter.rstrip('/')
# for live scan scan history id is passed as scan_history_id
# and no need to create scan_history object

# Get or create ScanHistory() object
if scan_type == LIVE_SCAN: # immediate
if scan_type == SCHEDULED_SCAN: # scheduled
# we need to create scan_history object for each scheduled scan
scan_history_id = create_scan_object(
host_id=domain_id,
engine_id=engine_id,
initiated_by_id=initiated_by_id,
)
scan = ScanHistory.objects.get(pk=scan_history_id)
scan.scan_status = RUNNING_TASK
elif scan_type == SCHEDULED_SCAN: # scheduled
scan = ScanHistory()
scan.scan_status = INITIATED_TASK
scan.scan_type = engine
scan.celery_ids = [initiate_scan.request.id]
scan.domain = domain
scan.start_scan_date = timezone.now()
scan.tasks = engine.tasks
uuid_scan = uuid.uuid1()
scan.results_dir = f'{results_dir}/{domain.name}/scans/{uuid_scan}'
add_gf_patterns = gf_patterns and 'fetch_url' in engine.tasks
if add_gf_patterns and is_iterable(gf_patterns):
scan.used_gf_patterns = ','.join(gf_patterns)
scan.save()

try:
scan.scan_type = engine
scan.celery_ids = [initiate_scan.request.id]
scan.domain = domain
scan.start_scan_date = timezone.now()
scan.tasks = engine.tasks
uuid_scan = uuid.uuid1()
scan.results_dir = f'{results_dir}/{domain.name}/scans/{uuid_scan}'
add_gf_patterns = gf_patterns and 'fetch_url' in engine.tasks
if add_gf_patterns and is_iterable(gf_patterns):
scan.used_gf_patterns = ','.join(gf_patterns)
scan.save()

# Create scan results dir
os.makedirs(scan.results_dir, exist_ok=True)
except:
import traceback

traceback.print_exc()
raise

# Build task context
ctx = {
'scan_history_id': scan_history_id,
'engine_id': engine_id,
'domain_id': domain.id,
'results_dir': scan.results_dir,
'url_filter': url_filter,
'yaml_configuration': config,
'out_of_scope_subdomains': out_of_scope_subdomains
}
ctx_str = json.dumps(ctx, indent=2)

# Send start notif
logger.warning(f'Starting scan {scan_history_id} with context:\n{ctx_str}')
send_scan_notif.delay(
scan_history_id,
subscan_id=None,
engine_id=engine_id,
status=CELERY_TASK_STATUS_MAP[scan.scan_status])
# Build task context
ctx = {
'scan_history_id': scan_history_id,
'engine_id': engine_id,
'domain_id': domain.id,
'results_dir': scan.results_dir,
'url_filter': url_filter,
'yaml_configuration': config,
'out_of_scope_subdomains': out_of_scope_subdomains
}
ctx_str = json.dumps(ctx, indent=2)

# Send start notif
logger.warning(f'Starting scan {scan_history_id} with context:\n{ctx_str}')
send_scan_notif.delay(
scan_history_id,
subscan_id=None,
engine_id=engine_id,
status=CELERY_TASK_STATUS_MAP[scan.scan_status])

# Save imported subdomains in DB
save_imported_subdomains(imported_subdomains, ctx=ctx)

# Create initial subdomain in DB: make a copy of domain as a subdomain so
# that other tasks using subdomains can use it.
subdomain_name = domain.name
subdomain, _ = save_subdomain(subdomain_name, ctx=ctx)

# Save imported subdomains in DB
save_imported_subdomains(imported_subdomains, ctx=ctx)

# Create initial subdomain in DB: make a copy of domain as a subdomain so
# that other tasks using subdomains can use it.
subdomain_name = domain.name
subdomain, _ = save_subdomain(subdomain_name, ctx=ctx)
# If enable_http_crawl is set, create an initial root HTTP endpoint so that
# HTTP crawling can start somewhere
http_url = f'{domain.name}{url_filter}' if url_filter else domain.name
endpoint, _ = save_endpoint(
http_url,
ctx=ctx,
crawl=enable_http_crawl,
is_default=True,
subdomain=subdomain
)

# If enable_http_crawl is set, create an initial root HTTP endpoint so that
# HTTP crawling can start somewhere
http_url = f'{domain.name}{url_filter}' if url_filter else domain.name
endpoint, _ = save_endpoint(
http_url,
ctx=ctx,
crawl=enable_http_crawl,
is_default=True,
subdomain=subdomain
)
save_subdomain_metadata(subdomain, endpoint)

# Build Celery tasks, crafted according to the dependency graph below:
# subdomain_discovery --> port_scan --> fetch_url --> dir_file_fuzz
# osint vulnerability_scan
# osint dalfox xss scan
# screenshot
# waf_detection
workflow = chain(
group(
subdomain_discovery.si(ctx=ctx, description='Subdomain discovery'),
osint.si(ctx=ctx, description='OS Intelligence')
),
port_scan.si(ctx=ctx, description='Port scan'),
fetch_url.si(ctx=ctx, description='Fetch URL'),
group(
dir_file_fuzz.si(ctx=ctx, description='Directories & files fuzz'),
vulnerability_scan.si(ctx=ctx, description='Vulnerability scan'),
screenshot.si(ctx=ctx, description='Screenshot'),
waf_detection.si(ctx=ctx, description='WAF detection')
save_subdomain_metadata(subdomain, endpoint)


# Build Celery tasks, crafted according to the dependency graph below:
# subdomain_discovery --> port_scan --> fetch_url --> dir_file_fuzz
# osint vulnerability_scan
# osint dalfox xss scan
# screenshot
# waf_detection
workflow = chain(
group(
subdomain_discovery.si(ctx=ctx, description='Subdomain discovery'),
osint.si(ctx=ctx, description='OS Intelligence')
),
port_scan.si(ctx=ctx, description='Port scan'),
fetch_url.si(ctx=ctx, description='Fetch URL'),
group(
dir_file_fuzz.si(ctx=ctx, description='Directories & files fuzz'),
vulnerability_scan.si(ctx=ctx, description='Vulnerability scan'),
screenshot.si(ctx=ctx, description='Screenshot'),
waf_detection.si(ctx=ctx, description='WAF detection')
)
)
)

# Build callback
callback = report.si(ctx=ctx).set(link_error=[report.si(ctx=ctx)])
# Build callback
callback = report.si(ctx=ctx).set(link_error=[report.si(ctx=ctx)])

# Run Celery chord
logger.info(f'Running Celery workflow with {len(workflow.tasks) + 1} tasks')
task = chain(workflow, callback).on_error(callback).delay()
scan.celery_ids.append(task.id)
scan.save()
# Run Celery chord
logger.info(f'Running Celery workflow with {len(workflow.tasks) + 1} tasks')
task = chain(workflow, callback).on_error(callback).delay()
scan.celery_ids.append(task.id)
scan.save()

return {
'success': True,
'task_id': task.id
}
return {
'success': True,
'task_id': task.id
}

except Exception as e:
logger.exception(e)
if scan:
scan.scan_status = FAILED_TASK
scan.error_message = str(e)
scan.save()
return {
'success': False,
'error': str(e)
}

@app.task(name='initiate_subscan', bind=False, queue='subscan_queue')
def initiate_subscan(
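
For the scheduled path, initiate_scan is now expected to be invoked by celery beat with scan_type=SCHEDULED_SCAN and no pre-created ScanHistory (the function creates one via create_scan_object). The UI/view code that registers the periodic task is in the sixth changed file, which is not shown in this excerpt, so the following is only a sketch of how a one-off clocked task could be registered with django-celery-beat; the Celery task name, the kwargs keys and the surrounding variables are assumptions inferred from the signature above:

import json
from datetime import timedelta

from django.utils import timezone
from django_celery_beat.models import ClockedSchedule, PeriodicTask

# Store the clocked time in UTC, as the commit message requires.
run_at_utc = timezone.now() + timedelta(hours=2)        # illustrative

clocked = ClockedSchedule.objects.create(clocked_time=run_at_utc)
PeriodicTask.objects.create(
    clocked=clocked,
    one_off=True,                                       # run exactly once at clocked_time
    name=f'Scheduled scan for {domain.name}',           # must be unique; illustrative
    task='initiate_scan',                               # assumed registered task name
    kwargs=json.dumps({
        'scan_history_id': 0,                           # unused for SCHEDULED_SCAN
        'domain_id': domain.id,
        'engine_id': engine.id,
        'scan_type': SCHEDULED_SCAN,                    # constant from reNgine (assumed import)
        'initiated_by_id': request.user.id,
    }),
)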
2 changes: 1 addition & 1 deletion web/startScan/templates/startScan/schedule_scan_list.html
@@ -48,7 +48,7 @@
Will run exactly at {{ task.clocked.clocked_time}} UTC
{% endif %}
</td>
<td>{{ task.last_run_at|none_or_never }}</td>
<td>{{ task.last_run_at|none_or_never }} UTC</td>
<td class="text-center">{{ task.total_run_count }}</td>
<td class="text-center">
{% if task.one_off %}
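
The last_run_at cell relies on a none_or_never template filter whose implementation is not part of this commit; a minimal sketch of what such a Django filter typically looks like, assuming it is registered in one of the app's templatetags modules:

from django import template

register = template.Library()

@register.filter
def none_or_never(value):
    # Show a friendly placeholder when the scheduled task has never run.
    return value if value is not None else 'Never'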
14 changes: 14 additions & 0 deletions web/startScan/templates/startScan/schedule_scan_ui.html
@@ -205,5 +205,19 @@ <h4 class="text-warning">Out of Scope Subdomains(Optional)</h4>
});
});

</script>
<script>
	document.addEventListener('DOMContentLoaded', function() {
		var form = document.getElementById('start-scan-form');
		if (form) {
			var timezoneOffsetField = document.createElement('input');
			timezoneOffsetField.type = 'hidden';
			timezoneOffsetField.name = 'timezone_offset';
			timezoneOffsetField.value = new Date().getTimezoneOffset();
			form.appendChild(timezoneOffsetField);
		} else {
			console.error("The form with the ID 'start-scan-form' was not found.");
		}
	});
</script>
{% endblock page_level_script %}
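
JavaScript's getTimezoneOffset() returns the difference between UTC and local time in minutes (positive when local time is behind UTC), so adding it to a naive local datetime yields UTC. The view that consumes the hidden timezone_offset field belongs to the sixth changed file, which is not shown here; the snippet below is only a sketch of the conversion that view would have to perform, with the datetime format and field names assumed:

from datetime import datetime, timedelta, timezone as dt_timezone

def local_schedule_to_utc(local_dt_str, offset_minutes):
    """Convert the naive local datetime posted by the browser to an aware UTC datetime.

    offset_minutes is the raw value of JavaScript's getTimezoneOffset(),
    i.e. UTC = local time + offset.
    """
    local_dt = datetime.strptime(local_dt_str, '%Y-%m-%d %H:%M')   # posted format is an assumption
    return (local_dt + timedelta(minutes=int(offset_minutes))).replace(tzinfo=dt_timezone.utc)

# Example: a browser in UTC+2 posts offset_minutes='-120', so
# local_schedule_to_utc('2024-09-04 14:00', '-120') -> 2024-09-04 12:00 UTC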