Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BigQuery: add --max_results option to magic #9169

Merged
merged 7 commits into from
Sep 6, 2019
30 changes: 27 additions & 3 deletions bigquery/google/cloud/bigquery/magics.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,7 +266,7 @@ def default_query_job_config(self, value):
context = Context()


def _run_query(client, query, job_config=None):
def _run_query(client, query, job_config=None, max_results=None):
"""Runs a query while printing status updates

Args:
Expand Down Expand Up @@ -300,7 +300,7 @@ def _run_query(client, query, job_config=None):
while True:
print("\rQuery executing: {:0.2f}s".format(time.time() - start_time), end="")
try:
query_job.result(timeout=0.5)
query_job.result(timeout=0.5, max_results=max_results)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we aren't actually returning the results in this line (just waiting for the query to finish), we don't need to pass max_results here. I believe that means we can remove the max_results parameter from _run_query as well.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed in f88ffe8

break
except futures.TimeoutError:
continue
Expand All @@ -320,6 +320,22 @@ def _run_query(client, query, job_config=None):
default=None,
help=("Project to use for executing this query. Defaults to the context project."),
)
@magic_arguments.argument(
"--max_results",
default=None,
help=(
"Maximum number of rows in dataframe returned from executing the query."
"Defaults to returning all rows."
),
)
@magic_arguments.argument(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like these arguments got doubled up. (Maybe a result of the two commits we noticed when rebasing?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's probably what happened (moral of the story, always run tests before pushing). Removed the duplicate in 147e43d

"--max_results",
default=None,
help=(
"Maximum number of rows in dataframe returned from executing the query."
"Defaults to returning all rows."
),
)
@magic_arguments.argument(
"--maximum_bytes_billed",
default=None,
Expand Down Expand Up @@ -420,6 +436,12 @@ def _cell_magic(line, query):
bqstorage_client = _make_bqstorage_client(
args.use_bqstorage_api or context.use_bqstorage_api, context.credentials
)

if args.max_results:
max_results = int(args.max_results)
else:
max_results = None

job_config = bigquery.job.QueryJobConfig()
job_config.query_parameters = params
job_config.use_legacy_sql = args.use_legacy_sql
Expand All @@ -433,7 +455,9 @@ def _cell_magic(line, query):

error = None
try:
query_job = _run_query(client, query, job_config)
query_job = _run_query(
client, query, job_config=job_config, max_results=max_results
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

max_results is not actually needed here, it's not until the call to to_dataframe that we need max_results.

Note: to_dataframe probably doesn't have a max_results argument, and I'm actually not certain that we'd want to add one. Instead, we can call query_job.result with a max_results argument and then call to_dataframe on the resulting RowIterator.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I updated to call to_dataframe on query_job.result, passing max_results as an argument if max_results is present in f88ffe8

)
except Exception as ex:
error = str(ex)

Expand Down
65 changes: 61 additions & 4 deletions bigquery/tests/unit/test_magics.py
Original file line number Diff line number Diff line change
Expand Up @@ -414,7 +414,7 @@ def test_bigquery_magic_with_legacy_sql():
with run_query_patch as run_query_mock:
ip.run_cell_magic("bigquery", "--use_legacy_sql", "SELECT 17 AS num")

job_config_used = run_query_mock.call_args_list[0][0][-1]
job_config_used = run_query_mock.call_args_list[0][1]["job_config"]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was this change intentional? Maybe a bad rebase?

Copy link
Contributor Author

@shubha-rajan shubha-rajan Sep 4, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was intentional. It was to fix some test failures I was getting because the argument that was being retrieved was the wrong type (a result of changing the method signature for _run_query). This way the arg being accessed will always be job_config, even if other named parameters are added. Since I reverted the changes to _run_query, I can probably change this back too. I think the tests will pass either way

assert job_config_used.use_legacy_sql is True


Expand Down Expand Up @@ -645,6 +645,57 @@ def test_bigquery_magic_without_bqstorage(monkeypatch):
assert isinstance(return_value, pandas.DataFrame)


@pytest.mark.usefixtures("ipython_interactive")
def test_bigquery_magic_w_max_results_invalid():
    """Passing a non-integer ``--max_results`` value raises ``ValueError``."""
    ip = IPython.get_ipython()
    ip.extension_manager.load_extension("google.cloud.bigquery")
    magics.context._project = None

    fake_credentials = mock.create_autospec(
        google.auth.credentials.Credentials, instance=True
    )
    # Stub out auth so the magic does not hit real application-default creds.
    auth_patch = mock.patch(
        "google.auth.default", return_value=(fake_credentials, "general-project")
    )
    # Stub out Client.query so no request is ever issued.
    query_patch = mock.patch(
        "google.cloud.bigquery.client.Client.query", autospec=True
    )

    with auth_patch, query_patch, pytest.raises(ValueError):
        ip.run_cell_magic("bigquery", "--max_results=abc", "SELECT 17 AS num")


@pytest.mark.usefixtures("ipython_interactive")
def test_bigquery_magic_w_max_results_valid_calls_queryjob_result():
    """A valid ``--max_results`` value is forwarded to ``QueryJob.result``."""
    ip = IPython.get_ipython()
    ip.extension_manager.load_extension("google.cloud.bigquery")
    magics.context._project = None

    fake_credentials = mock.create_autospec(
        google.auth.credentials.Credentials, instance=True
    )
    # Stub out auth so the magic does not hit real application-default creds.
    auth_patch = mock.patch(
        "google.auth.default", return_value=(fake_credentials, "general-project")
    )
    # Replace Client.query so we can hand back a canned QueryJob mock.
    query_patch = mock.patch(
        "google.cloud.bigquery.client.Client.query", autospec=True
    )
    job_mock = mock.create_autospec(
        google.cloud.bigquery.job.QueryJob, instance=True
    )

    with auth_patch, query_patch as query_mock:
        query_mock.return_value = job_mock
        ip.run_cell_magic("bigquery", "--max_results=5", "SELECT 17 AS num")

    # The magic polls job.result(timeout=0.5, ...); a mock returns at once,
    # so exactly one call with the parsed max_results is expected.
    job_mock.result.assert_called_once_with(timeout=0.5, max_results=5)


@pytest.mark.usefixtures("ipython_interactive")
def test_bigquery_magic_dryrun_option_sets_job_config():
ip = IPython.get_ipython()
Expand All @@ -662,7 +713,7 @@ def test_bigquery_magic_dryrun_option_sets_job_config():
with run_query_patch as run_query_mock:
ip.run_cell_magic("bigquery", "--dry_run", sql)

job_config_used = run_query_mock.call_args_list[0][0][-1]
job_config_used = run_query_mock.call_args_list[0][1]["job_config"]
assert job_config_used.dry_run is True


Expand Down Expand Up @@ -924,7 +975,10 @@ def test_bigquery_magic_with_string_params():
run_query_mock.return_value = query_job_mock

ip.run_cell_magic("bigquery", 'params_string_df --params {"num":17}', sql)
run_query_mock.assert_called_once_with(mock.ANY, sql.format(num=17), mock.ANY)

run_query_mock.assert_called_once_with(
mock.ANY, sql.format(num=17), mock.ANY, max_results=None
)

assert "params_string_df" in ip.user_ns # verify that the variable exists
df = ip.user_ns["params_string_df"]
Expand Down Expand Up @@ -959,7 +1013,10 @@ def test_bigquery_magic_with_dict_params():
# Insert dictionary into user namespace so that it can be expanded
ip.user_ns["params"] = params
ip.run_cell_magic("bigquery", "params_dict_df --params $params", sql)
run_query_mock.assert_called_once_with(mock.ANY, sql.format(num=17), mock.ANY)

run_query_mock.assert_called_once_with(
mock.ANY, sql.format(num=17), mock.ANY, max_results=None
)

assert "params_dict_df" in ip.user_ns # verify that the variable exists
df = ip.user_ns["params_dict_df"]
Expand Down