From f64c38965451b9a8fd1088f30c321e0f4174645e Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Thu, 2 Nov 2023 18:14:10 -0700 Subject: [PATCH] Detect and ignore Jupyter automagics (#8398) ## Summary LangChain is attempting to use Ruff over their Jupyter notebooks (https://github.com/langchain-ai/langchain/pull/12677/files), but running into a bunch of syntax errors, the majority of which come from our inability to recognize automagic. If you run this in a cell: ```jupyter pip install requests ``` Jupyter will automatically treat that as: ```jupyter %pip install requests ``` We need to ignore cells that use these automagics, since the parser doesn't understand them. (I guess we could support it in the parser, but that seems much harder?). The good news is that AFAICT Jupyter doesn't let you mix automagics with code, so by skipping these cells, we don't miss out on analyzing any Python code. ## Test Plan 1. `cargo test` 2. Ran over LangChain and verified that there are no more errors relating to `pip install` automagics. --- .../test/fixtures/jupyter/cell/automagic.json | 8 ++ .../jupyter/cell/automagic_after_code.json | 8 ++ .../jupyter/cell/automagic_before_code.json | 8 ++ .../fixtures/jupyter/cell/automagics.json | 8 ++ crates/ruff_notebook/src/notebook.rs | 128 +++++++++++++++++- 5 files changed, 154 insertions(+), 6 deletions(-) create mode 100644 crates/ruff_notebook/resources/test/fixtures/jupyter/cell/automagic.json create mode 100644 crates/ruff_notebook/resources/test/fixtures/jupyter/cell/automagic_after_code.json create mode 100644 crates/ruff_notebook/resources/test/fixtures/jupyter/cell/automagic_before_code.json create mode 100644 crates/ruff_notebook/resources/test/fixtures/jupyter/cell/automagics.json diff --git a/crates/ruff_notebook/resources/test/fixtures/jupyter/cell/automagic.json b/crates/ruff_notebook/resources/test/fixtures/jupyter/cell/automagic.json new file mode 100644 index 0000000000000..5cb72064ec1c0 --- /dev/null +++ b/crates/ruff_notebook/resources/test/fixtures/jupyter/cell/automagic.json @@ -0,0 +1,8 @@ +{ + "execution_count": null, + "cell_type": "code", + "id": "1", + "metadata": {}, + "outputs": [], + "source": ["pip install requests"] +} diff --git a/crates/ruff_notebook/resources/test/fixtures/jupyter/cell/automagic_after_code.json b/crates/ruff_notebook/resources/test/fixtures/jupyter/cell/automagic_after_code.json new file mode 100644 index 0000000000000..619a4f7125e4e --- /dev/null +++ b/crates/ruff_notebook/resources/test/fixtures/jupyter/cell/automagic_after_code.json @@ -0,0 +1,8 @@ +{ + "execution_count": null, + "cell_type": "code", + "id": "1", + "metadata": {}, + "outputs": [], + "source": ["x = 1\n", "pip install requests"] +} diff --git a/crates/ruff_notebook/resources/test/fixtures/jupyter/cell/automagic_before_code.json b/crates/ruff_notebook/resources/test/fixtures/jupyter/cell/automagic_before_code.json new file mode 100644 index 0000000000000..10255fca0e9b3 --- /dev/null +++ b/crates/ruff_notebook/resources/test/fixtures/jupyter/cell/automagic_before_code.json @@ -0,0 +1,8 @@ +{ + "execution_count": null, + "cell_type": "code", + "id": "1", + "metadata": {}, + "outputs": [], + "source": ["pip install requests\n", "x = 1"] +} diff --git a/crates/ruff_notebook/resources/test/fixtures/jupyter/cell/automagics.json b/crates/ruff_notebook/resources/test/fixtures/jupyter/cell/automagics.json new file mode 100644 index 0000000000000..8bd20103af602 --- /dev/null +++ b/crates/ruff_notebook/resources/test/fixtures/jupyter/cell/automagics.json @@ -0,0 +1,8 @@ +{ + "execution_count": null, + "cell_type": "code", + "id": "1", + "metadata": {}, + "outputs": [], + "source": ["pip install requests\n", "pip install requests"] +} diff --git a/crates/ruff_notebook/src/notebook.rs b/crates/ruff_notebook/src/notebook.rs index 611b90cbee2e1..61994e7702648 100644 --- a/crates/ruff_notebook/src/notebook.rs +++ b/crates/ruff_notebook/src/notebook.rs @@ -80,13 +80,125 @@ impl Cell { // Ignore cells containing cell magic as they act on the entire cell // as compared to line magic which acts on a single line. !match source { - SourceValue::String(string) => string - .lines() - .any(|line| line.trim_start().starts_with("%%")), - SourceValue::StringArray(string_array) => string_array - .iter() - .any(|line| line.trim_start().starts_with("%%")), + SourceValue::String(string) => Self::is_magic_cell(string.lines()), + SourceValue::StringArray(string_array) => { + Self::is_magic_cell(string_array.iter().map(String::as_str)) + } + } + } + + /// Returns `true` if a cell should be ignored due to the use of cell magics. + fn is_magic_cell<'a>(lines: impl Iterator) -> bool { + let mut lines = lines.peekable(); + + // Detect automatic line magics (automagic), which aren't supported by the parser. If a line + // magic uses automagic, Jupyter doesn't allow following it with non-magic lines anyway, so + // we aren't missing out on any valid Python code. + // + // For example, this is valid: + // ```jupyter + // cat /path/to/file + // cat /path/to/file + // ``` + // + // But this is invalid: + // ```jupyter + // cat /path/to/file + // x = 1 + // ``` + // + // See: https://ipython.readthedocs.io/en/stable/interactive/magics.html + if lines + .peek() + .and_then(|line| line.split_whitespace().next()) + .is_some_and(|token| { + matches!( + token, + "alias" + | "alias_magic" + | "autoawait" + | "autocall" + | "automagic" + | "bookmark" + | "cd" + | "code_wrap" + | "colors" + | "conda" + | "config" + | "debug" + | "dhist" + | "dirs" + | "doctest_mode" + | "edit" + | "env" + | "gui" + | "history" + | "killbgscripts" + | "load" + | "load_ext" + | "loadpy" + | "logoff" + | "logon" + | "logstart" + | "logstate" + | "logstop" + | "lsmagic" + | "macro" + | "magic" + | "mamba" + | "matplotlib" + | "micromamba" + | "notebook" + | "page" + | "pastebin" + | "pdb" + | "pdef" + | "pdoc" + | "pfile" + | "pinfo" + | "pinfo2" + | "pip" + | "popd" + | "pprint" + | "precision" + | "prun" + | "psearch" + | "psource" + | "pushd" + | "pwd" + | "pycat" + | "pylab" + | "quickref" + | "recall" + | "rehashx" + | "reload_ext" + | "rerun" + | "reset" + | "reset_selective" + | "run" + | "save" + | "sc" + | "set_env" + | "sx" + | "system" + | "tb" + | "time" + | "timeit" + | "unalias" + | "unload_ext" + | "who" + | "who_ls" + | "whos" + | "xdel" + | "xmode" + ) + }) + { + return true; } + + // Detect cell magics (which operate on multiple lines). + lines.any(|line| line.trim_start().starts_with("%%")) } } @@ -481,6 +593,10 @@ mod tests { #[test_case(Path::new("code_and_magic.json"), true; "code_and_magic")] #[test_case(Path::new("only_code.json"), true; "only_code")] #[test_case(Path::new("cell_magic.json"), false; "cell_magic")] + #[test_case(Path::new("automagic.json"), false; "automagic")] + #[test_case(Path::new("automagics.json"), false; "automagics")] + #[test_case(Path::new("automagic_before_code.json"), false; "automagic_before_code")] + #[test_case(Path::new("automagic_after_code.json"), true; "automagic_after_code")] fn test_is_valid_code_cell(path: &Path, expected: bool) -> Result<()> { /// Read a Jupyter cell from the `resources/test/fixtures/jupyter/cell` directory. fn read_jupyter_cell(path: impl AsRef) -> Result {