Prevent mongo connector from counting more than 1M rows #159

Merged 3 commits on May 7, 2020
10 changes: 9 additions & 1 deletion doc/connectors/mongo.md
@@ -49,7 +49,9 @@ DATA_SOURCES: [
]
```

## Note
## Notes

### Context manager usage

The Mongo connector can be used as a context manager, to avoid repeatedly
opening and closing a connection to the same database.
@@ -68,3 +70,9 @@ with MongoConnector(name='mycon', host='myhost', port=27017) as con:
datasource = MongoDataSource(collection='test_col', query=query)
con.get_df(datasource)
```

### Document count limit

The Mongo connector limits the number of counted documents to one million, to
avoid scanning all the results of a very large query at each `get_slice` call.
A count of 1,000,001 (1M + 1) therefore means that there are more than one million results.
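For illustration, a minimal usage sketch of this behavior (the `(df, count)` return shape follows the tests in this PR; the connection and query parameters are placeholders):

```python
from toucan_connectors.mongo.mongo_connector import MongoConnector, MongoDataSource

with MongoConnector(name='mycon', host='myhost', port=27017) as con:
    datasource = MongoDataSource(collection='test_col', query={'domain': 'domain1'})
    df, count = con.get_slice(datasource, limit=50)
    if count == 1_000_001:  # the cap was reached
        print('more than one million matching documents')
    else:
        print(f'{count} matching documents')
```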
18 changes: 18 additions & 0 deletions tests/mongo/test_mongo.py
@@ -284,6 +284,24 @@ def test_get_slice_empty(mongo_connector, mongo_datasource):
assert df.shape == (0, 0)


def test_get_slice_max_count(mongo_connector, mongo_datasource, mocker):
"""
It should limit mongo's count operation to 1M rows

We're not going to insert a million rows in mongo just for this test,
so we mock the execution of the query.
"""
aggregate = mocker.spy(pymongo.collection.Collection, 'aggregate')

datasource = mongo_datasource(collection='test_col', query={'domain': 'unknown'})
df, count = mongo_connector.get_slice(datasource, limit=50)

aggregate.assert_called_once()
# call_args[0] holds the positional args (self, pipeline); pipeline[1] is the
# $facet stage, whose count facet must start with a $limit stage
assert '$limit' in aggregate.call_args[0][1][1]['$facet']['count'][0]
assert aggregate.call_args[0][1][1]['$facet']['count'][0]['$limit'] > 0


def test_get_df_with_regex(mongo_connector, mongo_datasource):
datasource = mongo_datasource(collection='test_col', query={'domain': 'domain1'})
df = mongo_connector.get_df_with_regex(datasource, field='country', regex=re.compile('r.*a'))
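For context, a hedged sketch of the pipeline shape the assertions above index into: the spy receives positional args `(self, pipeline)`, the `$match` stage is an assumption based on the datasource query, and the capped count facet matches the connector change below.

```python
# Assumed pipeline passed to Collection.aggregate in this test:
pipeline = [
    {'$match': {'domain': 'unknown'}},  # built from the datasource query
    {'$facet': {
        'count': [{'$limit': 1000001}, {'$count': 'value'}],  # capped count
        'df': [{'$limit': 50}],  # the actual page of results
    }},
]
```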
5 changes: 4 additions & 1 deletion toucan_connectors/mongo/mongo_connector.py
@@ -20,6 +20,8 @@
strlist_to_enum,
)

MAX_COUNTED_ROWS = 1000001  # 1M + 1: reaching this value means "more than 1M results"


def normalize_query(query, parameters):
query = nosql_apply_parameters_to_query(query, parameters)
@@ -226,7 +228,8 @@ def get_slice(
df_facet.append({'$limit': limit})
facet = {
'$facet': {
'count': [{'$count': 'value'}],
# counting more than 1M values can be really slow, and the exact number is not that relevant
'count': [{'$limit': MAX_COUNTED_ROWS}, {'$count': 'value'}],
'df': df_facet, # df_facet is never empty
}
}
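To make the capped count concrete, here is a minimal standalone sketch of the equivalent raw aggregation (assuming a reachable MongoDB and pymongo; the database and collection names are made up):

```python
import pymongo

MAX_COUNTED_ROWS = 1000001

client = pymongo.MongoClient('mongodb://localhost:27017')
col = client['mydb']['test_col']  # hypothetical database/collection

result = list(col.aggregate([
    {'$match': {'domain': 'domain1'}},
    {'$facet': {
        # $limit caps the scan, so the count can never exceed 1M + 1
        'count': [{'$limit': MAX_COUNTED_ROWS}, {'$count': 'value'}],
        'df': [{'$limit': 50}],
    }},
]))
# result[0]['count'] is [{'value': <capped count>}], or [] when nothing matches;
# a value of 1000001 means "more than one million matching documents"
```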