Skip to content

Commit

Permalink
Detect and ignore Jupyter automagics (#8398)
Browse files Browse the repository at this point in the history
## Summary

LangChain is attempting to use Ruff over their Jupyter notebooks
(https://github.com/langchain-ai/langchain/pull/12677/files), but
running into a bunch of syntax errors, the majority of which come from
our inability to recognize automagic.

If you run this in a cell:

```jupyter
pip install requests
```

Jupyter will automatically treat that as:

```jupyter
%pip install requests
```

We need to ignore cells that use these automagics, since the parser
doesn't understand them. (I guess we could support it in the parser, but
that seems much harder?). The good news is that AFAICT Jupyter doesn't
let you mix automagics with code, so by skipping these cells, we don't
miss out on analyzing any Python code.

## Test Plan

1. `cargo test`
2. Ran over LangChain and verified that there are no more errors
relating to `pip install` automagics.
  • Loading branch information
charliermarsh authored Nov 3, 2023
1 parent 2ff1afb commit f64c389
Show file tree
Hide file tree
Showing 5 changed files with 154 additions and 6 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"execution_count": null,
"cell_type": "code",
"id": "1",
"metadata": {},
"outputs": [],
"source": ["pip install requests"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"execution_count": null,
"cell_type": "code",
"id": "1",
"metadata": {},
"outputs": [],
"source": ["x = 1\n", "pip install requests"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"execution_count": null,
"cell_type": "code",
"id": "1",
"metadata": {},
"outputs": [],
"source": ["pip install requests\n", "x = 1"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"execution_count": null,
"cell_type": "code",
"id": "1",
"metadata": {},
"outputs": [],
"source": ["pip install requests\n", "pip install requests"]
}
128 changes: 122 additions & 6 deletions crates/ruff_notebook/src/notebook.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,13 +80,125 @@ impl Cell {
// Ignore cells containing cell magic as they act on the entire cell
// as compared to line magic which acts on a single line.
!match source {
SourceValue::String(string) => string
.lines()
.any(|line| line.trim_start().starts_with("%%")),
SourceValue::StringArray(string_array) => string_array
.iter()
.any(|line| line.trim_start().starts_with("%%")),
SourceValue::String(string) => Self::is_magic_cell(string.lines()),
SourceValue::StringArray(string_array) => {
Self::is_magic_cell(string_array.iter().map(String::as_str))
}
}
}

/// Returns `true` if a cell should be ignored due to the use of cell magics.
fn is_magic_cell<'a>(lines: impl Iterator<Item = &'a str>) -> bool {
let mut lines = lines.peekable();

// Detect automatic line magics (automagic), which aren't supported by the parser. If a line
// magic uses automagic, Jupyter doesn't allow following it with non-magic lines anyway, so
// we aren't missing out on any valid Python code.
//
// For example, this is valid:
// ```jupyter
// cat /path/to/file
// cat /path/to/file
// ```
//
// But this is invalid:
// ```jupyter
// cat /path/to/file
// x = 1
// ```
//
// See: https://ipython.readthedocs.io/en/stable/interactive/magics.html
if lines
.peek()
.and_then(|line| line.split_whitespace().next())
.is_some_and(|token| {
matches!(
token,
"alias"
| "alias_magic"
| "autoawait"
| "autocall"
| "automagic"
| "bookmark"
| "cd"
| "code_wrap"
| "colors"
| "conda"
| "config"
| "debug"
| "dhist"
| "dirs"
| "doctest_mode"
| "edit"
| "env"
| "gui"
| "history"
| "killbgscripts"
| "load"
| "load_ext"
| "loadpy"
| "logoff"
| "logon"
| "logstart"
| "logstate"
| "logstop"
| "lsmagic"
| "macro"
| "magic"
| "mamba"
| "matplotlib"
| "micromamba"
| "notebook"
| "page"
| "pastebin"
| "pdb"
| "pdef"
| "pdoc"
| "pfile"
| "pinfo"
| "pinfo2"
| "pip"
| "popd"
| "pprint"
| "precision"
| "prun"
| "psearch"
| "psource"
| "pushd"
| "pwd"
| "pycat"
| "pylab"
| "quickref"
| "recall"
| "rehashx"
| "reload_ext"
| "rerun"
| "reset"
| "reset_selective"
| "run"
| "save"
| "sc"
| "set_env"
| "sx"
| "system"
| "tb"
| "time"
| "timeit"
| "unalias"
| "unload_ext"
| "who"
| "who_ls"
| "whos"
| "xdel"
| "xmode"
)
})
{
return true;
}

// Detect cell magics (which operate on multiple lines).
lines.any(|line| line.trim_start().starts_with("%%"))
}
}

Expand Down Expand Up @@ -481,6 +593,10 @@ mod tests {
#[test_case(Path::new("code_and_magic.json"), true; "code_and_magic")]
#[test_case(Path::new("only_code.json"), true; "only_code")]
#[test_case(Path::new("cell_magic.json"), false; "cell_magic")]
#[test_case(Path::new("automagic.json"), false; "automagic")]
#[test_case(Path::new("automagics.json"), false; "automagics")]
#[test_case(Path::new("automagic_before_code.json"), false; "automagic_before_code")]
#[test_case(Path::new("automagic_after_code.json"), true; "automagic_after_code")]
fn test_is_valid_code_cell(path: &Path, expected: bool) -> Result<()> {
/// Read a Jupyter cell from the `resources/test/fixtures/jupyter/cell` directory.
fn read_jupyter_cell(path: impl AsRef<Path>) -> Result<Cell> {
Expand Down

0 comments on commit f64c389

Please sign in to comment.