diff --git a/doc/connectors/mongo.md b/doc/connectors/mongo.md
index a15700a7f..485108a95 100644
--- a/doc/connectors/mongo.md
+++ b/doc/connectors/mongo.md
@@ -49,7 +49,9 @@ DATA_SOURCES: [
 ]
 ```
 
-## Note
+## Notes
+
+### Context manager usage
 
 The Mongo connector can be used as a context manager to avoid opening
 and closing a connection to a same database.
@@ -68,3 +70,9 @@ with MongoConnector(name='mycon', host='myhost', port=27017) as con:
     datasource = MongoDataSource(collection='test_col', query=query)
     con.get_df(datasource)
 ```
+
+### Document count limit
+
+The Mongo connector limits the number of counted documents to one million, to
+avoid scanning all results of a very large query at each `get_slice` call.
+A count of 1M + 1 means that there are more than one million results.
diff --git a/tests/mongo/test_mongo.py b/tests/mongo/test_mongo.py
index 95a28fe8d..7048cfd28 100644
--- a/tests/mongo/test_mongo.py
+++ b/tests/mongo/test_mongo.py
@@ -284,6 +284,24 @@ def test_get_slice_empty(mongo_connector, mongo_datasource):
     assert df.shape == (0, 0)
 
 
+def test_get_slice_max_count(mongo_connector, mongo_datasource, mocker):
+    """
+    It should limit mongo's count operation to 1M rows
+
+    We're not going to insert a million rows in mongo just for this test,
+    so we mock the execution of the query.
+    """
+    aggregate = mocker.spy(pymongo.collection.Collection, 'aggregate')
+
+    datasource = mongo_datasource(collection='test_col', query={'domain': 'unknown'})
+    df, count = mongo_connector.get_slice(datasource, limit=50)
+
+    aggregate.assert_called_once()
+    # count facet must be limited
+    assert '$limit' in aggregate.call_args[0][1][1]['$facet']['count'][0]
+    assert aggregate.call_args[0][1][1]['$facet']['count'][0]['$limit'] > 0
+
+
 def test_get_df_with_regex(mongo_connector, mongo_datasource):
     datasource = mongo_datasource(collection='test_col', query={'domain': 'domain1'})
     df = mongo_connector.get_df_with_regex(datasource, field='country', regex=re.compile('r.*a'))
diff --git a/toucan_connectors/mongo/mongo_connector.py b/toucan_connectors/mongo/mongo_connector.py
index 52abebf76..94da49399 100644
--- a/toucan_connectors/mongo/mongo_connector.py
+++ b/toucan_connectors/mongo/mongo_connector.py
@@ -20,6 +20,8 @@
     strlist_to_enum,
 )
 
+MAX_COUNTED_ROWS = 1000001
+
 
 def normalize_query(query, parameters):
     query = nosql_apply_parameters_to_query(query, parameters)
@@ -226,7 +228,8 @@ def get_slice(
         df_facet.append({'$limit': limit})
         facet = {
             '$facet': {
-                'count': [{'$count': 'value'}],
+                # counting more than 1M values can be really slow, and the exact number is not that relevant
+                'count': [{'$limit': MAX_COUNTED_ROWS}, {'$count': 'value'}],
                 'df': df_facet,  # df_facet is never empty
             }
         }