From 2c0378286bd36587df8c1f57cb3dd3250fdffc3e Mon Sep 17 00:00:00 2001 From: David Nowinsky Date: Mon, 4 May 2020 15:37:24 +0200 Subject: [PATCH 1/3] Prevent mongo connector to count more than 1M rows --- toucan_connectors/mongo/mongo_connector.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/toucan_connectors/mongo/mongo_connector.py b/toucan_connectors/mongo/mongo_connector.py index 52abebf76..94da49399 100644 --- a/toucan_connectors/mongo/mongo_connector.py +++ b/toucan_connectors/mongo/mongo_connector.py @@ -20,6 +20,8 @@ strlist_to_enum, ) +MAX_COUNTED_ROWS = 1000001 + def normalize_query(query, parameters): query = nosql_apply_parameters_to_query(query, parameters) @@ -226,7 +228,8 @@ def get_slice( df_facet.append({'$limit': limit}) facet = { '$facet': { - 'count': [{'$count': 'value'}], + # counting more than 1M values can be really slow, and the exact number is not that relevant + 'count': [{'$limit': MAX_COUNTED_ROWS}, {'$count': 'value'}], 'df': df_facet, # df_facet is never empty } } From 680be37227662b29fd61fd40a3465cbc564179d5 Mon Sep 17 00:00:00 2001 From: David Nowinsky Date: Mon, 4 May 2020 15:51:11 +0200 Subject: [PATCH 2/3] Document max counted rows for Mongo connector --- doc/connectors/mongo.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/doc/connectors/mongo.md b/doc/connectors/mongo.md index a15700a7f..485108a95 100644 --- a/doc/connectors/mongo.md +++ b/doc/connectors/mongo.md @@ -49,7 +49,9 @@ DATA_SOURCES: [ ] ``` -## Note +## Notes + +### Context manager usage The Mongo connector can be used as a context manager to avoid opening and closing a connection to a same database. 
@@ -68,3 +70,9 @@ with MongoConnector(name='mycon', host='myhost', port=27017) as con: datasource = MongoDataSource(collection='test_col', query=query) con.get_df(datasource) ``` + +### Document count limit + +The Mongo connector limits the number of counted documents to one million, to +avoid scanning all results of a very large query at each `get_slice` call. +A count of 1,000,001 means that there are more than one million results. From 2f6af283b3272a96f314a8e5850dbbf43902424c Mon Sep 17 00:00:00 2001 From: David Nowinsky Date: Mon, 4 May 2020 16:28:23 +0200 Subject: [PATCH 3/3] Test count limit for mongo connector --- tests/mongo/test_mongo.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/mongo/test_mongo.py b/tests/mongo/test_mongo.py index 95a28fe8d..7048cfd28 100644 --- a/tests/mongo/test_mongo.py +++ b/tests/mongo/test_mongo.py @@ -284,6 +284,24 @@ def test_get_slice_empty(mongo_connector, mongo_datasource): assert df.shape == (0, 0) +def test_get_slice_max_count(mongo_connector, mongo_datasource, mocker): + """ + It should limit mongo's count operation to 1M rows + + We're not going to insert a million rows in mongo just for this test, + so we mock the execution of the query. + """ + aggregate = mocker.spy(pymongo.collection.Collection, 'aggregate') + + datasource = mongo_datasource(collection='test_col', query={'domain': 'unknown'}) + df, count = mongo_connector.get_slice(datasource, limit=50) + + aggregate.assert_called_once() + # count facet must be limited + assert '$limit' in aggregate.call_args[0][1][1]['$facet']['count'][0] + assert aggregate.call_args[0][1][1]['$facet']['count'][0]['$limit'] > 0 + + def test_get_df_with_regex(mongo_connector, mongo_datasource): datasource = mongo_datasource(collection='test_col', query={'domain': 'domain1'}) df = mongo_connector.get_df_with_regex(datasource, field='country', regex=re.compile('r.*a'))