From a927012057738095ca83697b5c74980d72214ee5 Mon Sep 17 00:00:00 2001
From: Mohizur Khan
Date: Wed, 29 May 2024 16:41:53 +0100
Subject: [PATCH] feat: logging during ingest

In some cases people think pg-bulk-ingest is either doing nothing during
ingest, or loading all the data into memory. This adds a bit more logging to
make it clearer that data is going into the database, interleaved with
fetching data from the source (due to the magic of iterables/generators).

The slightly complex logic on when to log is an attempt to make the logging
suitable for both small and large numbers of rows without per-ingest
configuration, while keeping the noise down in the logs. It logs every 100
rows initially, rising to every million rows later in the ingest.

For example, it could log:

Ingesting from source into the database...
Ingested 100 rows...
Ingested 200 rows...
Ingested 300 rows...
Ingested 400 rows...
Ingested 500 rows...
Ingested 600 rows...
Ingested 700 rows...
Ingested 800 rows...
Ingested 900 rows...
Ingested 1000 rows...
Ingested 2000 rows...
Ingested 3000 rows...
Ingested 4000 rows...
Ingested 5000 rows...
Ingested 6000 rows...
Ingested 7000 rows...
Ingested 8000 rows...
Ingested 9000 rows...
Ingested 10000 rows...
Ingested 20000 rows...
Ingested 30000 rows...
Ingested 40000 rows...
Ingested 50000 rows...
Ingested 60000 rows...
Ingested 70000 rows...
Ingested 80000 rows...
Ingested 90000 rows...
Ingested 100000 rows...
Ingested 200000 rows...
Ingested 300000 rows...
Ingested 400000 rows...
Ingested 500000 rows...
Ingested 600000 rows...
Ingested 700000 rows...
Ingested 800000 rows...
Ingested 900000 rows...
Ingested 1000000 rows...
Ingested 2000000 rows...
Ingested 3000000 rows...
Ingested 4000000 rows...
Ingested 5000000 rows...
Ingested 6000000 rows...
Ingested 7000000 rows...
Ingested 8000000 rows...
Ingested 9000000 rows...
Ingested 10000000 rows...
Ingested 11000000 rows...
Ingested 12000000 rows...
Ingested 12312312 rows in total

Co-authored-by: Tash Boyse <57753415+nboyse@users.noreply.github.com>
Co-authored-by: Michal Charemza
Co-authored-by: Mohizur Khan
Co-authored-by: Josh Wong <166488409+joshwong-cs@users.noreply.github.com>
---
 pg_bulk_ingest.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/pg_bulk_ingest.py b/pg_bulk_ingest.py
index 0d7a378..6f0fcc6 100644
--- a/pg_bulk_ingest.py
+++ b/pg_bulk_ingest.py
@@ -1,5 +1,6 @@
 import uuid
 import logging
+import math
 from collections import deque, defaultdict
 from contextlib import contextmanager
 from enum import Enum
@@ -258,8 +259,20 @@ def escape_string(text):
         else:
             return lambda v: (null if v is None else escape_string(str(v)))
 
+    def logged(rows):
+        i = None
+        logger.info("Ingesting from source into the database...")
+        for i, row in enumerate(rows):
+            yield row
+            # Logs frequently for small numbers of rows, less frequently for larger
+            if (i + 1) % (10 ** min((max((math.floor(math.log10(i + 1))), 2)), 6)) == 0:
+                logger.info("Ingested %s rows...", i + 1)
+
+        total = i + 1 if i is not None else 0
+        logger.info("Ingested %s rows in total", total)
+
     converters = tuple(get_converter(column.type) for column in batch_table.columns)
-    db_rows = (
+    db_rows = logged(
         '\t'.join(converter(value) for (converter,value) in zip(converters, row)) + '\n'
         for row_table, row in rows
         if row_table is user_facing_table
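
Note (not part of the patch itself): below is a minimal standalone sketch of the adaptive log-interval logic used by the logged() generator in the diff above. The helper name log_interval and the sample row counts are illustrative only, not names from pg-bulk-ingest.

# Illustrative sketch, not part of the patch or of pg-bulk-ingest itself:
# the same adaptive interval as the logged() generator above, pulled out
# into a standalone function. The name log_interval is hypothetical.
import math

def log_interval(rows_ingested):
    # Every 100 rows below 1,000 rows ingested, every 1,000 below 10,000,
    # and so on, capped at every 1,000,000 rows
    return 10 ** min(max(math.floor(math.log10(rows_ingested)), 2), 6)

# Which running totals would produce an "Ingested ... rows..." line
for n in (100, 950, 1000, 999999, 1000000, 12000000):
    print(n, 'logged' if n % log_interval(n) == 0 else 'skipped')

The cap at 10 ** 6 is what keeps multi-million-row ingests to roughly one log line per million rows, matching the tail of the example output in the commit message.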