Prevent mongo connector from counting more than 1M rows #159

Merged 3 commits on May 7, 2020
10 changes: 9 additions & 1 deletion doc/connectors/mongo.md
@@ -49,7 +49,9 @@ DATA_SOURCES: [
]
```

## Note
## Notes

### Context manager usage

The Mongo connector can be used as a context manager, to avoid repeatedly
opening and closing a connection to the same database.
@@ -68,3 +70,9 @@ with MongoConnector(name='mycon', host='myhost', port=27017) as con:
datasource = MongoDataSource(collection='test_col', query=query)
con.get_df(datasource)
```

### Document count limit

The Mongo connector limits the number of counted documents to one million, to
avoid scanning all the results of a very large query at each `get_slice` call.
A count of 1,000,001 (1M + 1) therefore means that there are more than one million results.
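For illustration, a minimal usage sketch of this behavior (the `(df, count)` return shape follows the tests in this PR; the connection and query parameters are placeholders):

```python
from toucan_connectors.mongo.mongo_connector import MongoConnector, MongoDataSource

with MongoConnector(name='mycon', host='myhost', port=27017) as con:
    datasource = MongoDataSource(collection='test_col', query={'domain': 'domain1'})
    df, count = con.get_slice(datasource, limit=50)
    if count == 1_000_001:  # the cap was reached
        print('more than one million matching documents')
    else:
        print(f'{count} matching documents')
```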
18 changes: 18 additions & 0 deletions tests/mongo/test_mongo.py
@@ -284,6 +284,24 @@ def test_get_slice_empty(mongo_connector, mongo_datasource):
assert df.shape == (0, 0)


def test_get_slice_max_count(mongo_connector, mongo_datasource, mocker):
"""
It should limit mongo's count operation to 1M rows

We're not going to insert a million rows in mongo just for this test,
so we mock the execution of the query.
"""
aggregate = mocker.spy(pymongo.collection.Collection, 'aggregate')

datasource = mongo_datasource(collection='test_col', query={'domain': 'unknown'})
df, count = mongo_connector.get_slice(datasource, limit=50)

aggregate.assert_called_once()
# call_args[0] holds the positional args (self, pipeline); pipeline[1] is the
# $facet stage, whose count facet must start with a $limit stage
assert '$limit' in aggregate.call_args[0][1][1]['$facet']['count'][0]
assert aggregate.call_args[0][1][1]['$facet']['count'][0]['$limit'] > 0


def test_get_df_with_regex(mongo_connector, mongo_datasource):
datasource = mongo_datasource(collection='test_col', query={'domain': 'domain1'})
df = mongo_connector.get_df_with_regex(datasource, field='country', regex=re.compile('r.*a'))
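For context, a hedged sketch of the pipeline shape the assertions above index into: the spy receives positional args `(self, pipeline)`, the `$match` stage is an assumption based on the datasource query, and the capped count facet matches the connector change below.

```python
# Assumed pipeline passed to Collection.aggregate in this test:
pipeline = [
    {'$match': {'domain': 'unknown'}},  # built from the datasource query
    {'$facet': {
        'count': [{'$limit': 1000001}, {'$count': 'value'}],  # capped count
        'df': [{'$limit': 50}],  # the actual page of results
    }},
]
```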
5 changes: 4 additions & 1 deletion toucan_connectors/mongo/mongo_connector.py
@@ -20,6 +20,8 @@
strlist_to_enum,
)

MAX_COUNTED_ROWS = 1000001  # 1M + 1: reaching this value means "more than 1M results"


def normalize_query(query, parameters):
query = nosql_apply_parameters_to_query(query, parameters)
@@ -226,7 +228,8 @@ def get_slice(
df_facet.append({'$limit': limit})
facet = {
'$facet': {
'count': [{'$count': 'value'}],
# counting more than 1M values can be really slow, and the exact number is not that relevant
'count': [{'$limit': MAX_COUNTED_ROWS}, {'$count': 'value'}],
'df': df_facet, # df_facet is never empty
}
}
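To make the capped count concrete, here is a minimal standalone sketch of the equivalent raw aggregation (assuming a reachable MongoDB and pymongo; the database and collection names are made up):

```python
import pymongo

MAX_COUNTED_ROWS = 1000001

client = pymongo.MongoClient('mongodb://localhost:27017')
col = client['mydb']['test_col']  # hypothetical database/collection

result = list(col.aggregate([
    {'$match': {'domain': 'domain1'}},
    {'$facet': {
        # $limit caps the scan, so the count can never exceed 1M + 1
        'count': [{'$limit': MAX_COUNTED_ROWS}, {'$count': 'value'}],
        'df': [{'$limit': 50}],
    }},
]))
# result[0]['count'] is [{'value': <capped count>}], or [] when nothing matches;
# a value of 1000001 means "more than one million matching documents"
```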