From 11f9a46cef7e5b534f409dfa7f996910eb148f1c Mon Sep 17 00:00:00 2001 From: Evan Rittenhouse Date: Thu, 15 Jun 2023 12:14:39 -0500 Subject: [PATCH] Add CodeExtractor trait --- crates/ruff/src/jupyter/notebook.rs | 274 ++++++++++--------- crates/ruff/src/markdown/markdown_adapter.rs | 103 +++++++ crates/ruff/src/markdown/mod.rs | 3 + crates/ruff/src/source_kind.rs | 11 + crates/ruff_cli/src/diagnostics.rs | 4 +- 5 files changed, 259 insertions(+), 136 deletions(-) create mode 100644 crates/ruff/src/markdown/markdown_adapter.rs create mode 100644 crates/ruff/src/markdown/mod.rs diff --git a/crates/ruff/src/jupyter/notebook.rs b/crates/ruff/src/jupyter/notebook.rs index d7381cf56f5969..25087dd4582376 100644 --- a/crates/ruff/src/jupyter/notebook.rs +++ b/crates/ruff/src/jupyter/notebook.rs @@ -17,6 +17,7 @@ use crate::autofix::source_map::{SourceMap, SourceMarker}; use crate::jupyter::index::JupyterIndex; use crate::jupyter::{Cell, CellType, RawNotebook, SourceValue}; use crate::rules::pycodestyle::rules::SyntaxError; +use crate::source_kind::CodeExtractor; use crate::IOError; pub const JUPYTER_NOTEBOOK_EXT: &str = "ipynb"; @@ -25,7 +26,7 @@ const MAGIC_PREFIX: [&str; 3] = ["%", "!", "?"]; /// Run round-trip source code generation on a given Jupyter notebook file path. 
pub fn round_trip(path: &Path) -> anyhow::Result { - let mut notebook = Notebook::read(path).map_err(|err| { + let mut notebook = Notebook::extract_code(path).map_err(|err| { anyhow::anyhow!( "Failed to read notebook file `{}`: {:?}", path.display(), @@ -97,133 +98,6 @@ pub struct Notebook { } impl Notebook { - /// See also the black implementation - /// - pub fn read(path: &Path) -> Result> { - let reader = BufReader::new(File::open(path).map_err(|err| { - Diagnostic::new( - IOError { - message: format!("{err}"), - }, - TextRange::default(), - ) - })?); - let notebook: RawNotebook = match serde_json::from_reader(reader) { - Ok(notebook) => notebook, - Err(err) => { - // Translate the error into a diagnostic - return Err(Box::new({ - match err.classify() { - Category::Io => Diagnostic::new( - IOError { - message: format!("{err}"), - }, - TextRange::default(), - ), - Category::Syntax | Category::Eof => { - // Maybe someone saved the python sources (those with the `# %%` separator) - // as jupyter notebook instead. Let's help them. 
- let contents = std::fs::read_to_string(path).map_err(|err| { - Diagnostic::new( - IOError { - message: format!("{err}"), - }, - TextRange::default(), - ) - })?; - // Check if tokenizing was successful and the file is non-empty - if (ruff_rustpython::tokenize(&contents)) - .last() - .map_or(true, Result::is_err) - { - Diagnostic::new( - SyntaxError { - message: format!( - "A Jupyter Notebook (.{JUPYTER_NOTEBOOK_EXT}) must internally be JSON, \ - but this file isn't valid JSON: {err}" - ), - }, - TextRange::default(), - ) - } else { - Diagnostic::new( - SyntaxError { - message: format!( - "Expected a Jupyter Notebook (.{JUPYTER_NOTEBOOK_EXT} extension), \ - which must be internally stored as JSON, \ - but found a Python source file: {err}" - ), - }, - TextRange::default(), - ) - } - } - Category::Data => { - // We could try to read the schema version here but if this fails it's - // a bug anyway - Diagnostic::new( - SyntaxError { - message: format!( - "This file does not match the schema expected of Jupyter Notebooks: {err}" - ), - }, - TextRange::default(), - ) - } - } - })); - } - }; - - // v4 is what everybody uses - if notebook.nbformat != 4 { - // bail because we should have already failed at the json schema stage - return Err(Box::new(Diagnostic::new( - SyntaxError { - message: format!( - "Expected Jupyter Notebook format 4, found {}", - notebook.nbformat - ), - }, - TextRange::default(), - ))); - } - - let valid_code_cells = notebook - .cells - .iter() - .enumerate() - .filter(|(_, cell)| cell.is_valid_code_cell()) - .map(|(pos, _)| u32::try_from(pos).unwrap()) - .collect::>(); - - let mut contents = Vec::with_capacity(valid_code_cells.len()); - let mut current_offset = TextSize::from(0); - let mut cell_offsets = Vec::with_capacity(notebook.cells.len()); - cell_offsets.push(TextSize::from(0)); - - for &pos in &valid_code_cells { - let cell_contents = match ¬ebook.cells[pos as usize].source { - SourceValue::String(string) => string.clone(), - 
SourceValue::StringArray(string_array) => string_array.join(""), - }; - current_offset += TextSize::of(&cell_contents) + TextSize::new(1); - contents.push(cell_contents); - cell_offsets.push(current_offset); - } - - Ok(Self { - raw: notebook, - index: OnceCell::new(), - // The additional newline at the end is to maintain consistency for - // all cells. These newlines will be removed before updating the - // source code with the transformed content. Refer `update_cell_content`. - content: contents.join("\n") + "\n", - cell_offsets, - valid_code_cells, - }) - } - /// Update the cell offsets as per the given [`SourceMap`]. fn update_cell_offsets(&mut self, source_map: &SourceMap) { // When there are multiple cells without any edits, the offsets of those @@ -402,6 +276,135 @@ impl Notebook { } } +impl CodeExtractor for Notebook { + /// See also the black implementation + /// + fn extract_code(path: &Path) -> Result> { + let reader = BufReader::new(File::open(path).map_err(|err| { + Diagnostic::new( + IOError { + message: format!("{err}"), + }, + TextRange::default(), + ) + })?); + let notebook: RawNotebook = match serde_json::from_reader(reader) { + Ok(notebook) => notebook, + Err(err) => { + // Translate the error into a diagnostic + return Err(Box::new({ + match err.classify() { + Category::Io => Diagnostic::new( + IOError { + message: format!("{err}"), + }, + TextRange::default(), + ), + Category::Syntax | Category::Eof => { + // Maybe someone saved the python sources (those with the `# %%` separator) + // as jupyter notebook instead. Let's help them. 
+ let contents = std::fs::read_to_string(path).map_err(|err| { + Diagnostic::new( + IOError { + message: format!("{err}"), + }, + TextRange::default(), + ) + })?; + // Check if tokenizing was successful and the file is non-empty + if (ruff_rustpython::tokenize(&contents)) + .last() + .map_or(true, Result::is_err) + { + Diagnostic::new( + SyntaxError { + message: format!( + "A Jupyter Notebook (.{JUPYTER_NOTEBOOK_EXT}) must internally be JSON, \ + but this file isn't valid JSON: {err}" + ), + }, + TextRange::default(), + ) + } else { + Diagnostic::new( + SyntaxError { + message: format!( + "Expected a Jupyter Notebook (.{JUPYTER_NOTEBOOK_EXT} extension), \ + which must be internally stored as JSON, \ + but found a Python source file: {err}" + ), + }, + TextRange::default(), + ) + } + } + Category::Data => { + // We could try to read the schema version here but if this fails it's + // a bug anyway + Diagnostic::new( + SyntaxError { + message: format!( + "This file does not match the schema expected of Jupyter Notebooks: {err}" + ), + }, + TextRange::default(), + ) + } + } + })); + } + }; + + // v4 is what everybody uses + if notebook.nbformat != 4 { + // bail because we should have already failed at the json schema stage + return Err(Box::new(Diagnostic::new( + SyntaxError { + message: format!( + "Expected Jupyter Notebook format 4, found {}", + notebook.nbformat + ), + }, + TextRange::default(), + ))); + } + + let valid_code_cells = notebook + .cells + .iter() + .enumerate() + .filter(|(_, cell)| cell.is_valid_code_cell()) + .map(|(pos, _)| u32::try_from(pos).unwrap()) + .collect::>(); + + let mut contents = Vec::with_capacity(valid_code_cells.len()); + let mut current_offset = TextSize::from(0); + let mut cell_offsets = Vec::with_capacity(notebook.cells.len()); + cell_offsets.push(TextSize::from(0)); + + for &pos in &valid_code_cells { + let cell_contents = match ¬ebook.cells[pos as usize].source { + SourceValue::String(string) => string.clone(), + 
SourceValue::StringArray(string_array) => string_array.join(""), + }; + current_offset += TextSize::of(&cell_contents) + TextSize::new(1); + contents.push(cell_contents); + cell_offsets.push(current_offset); + } + + Ok(Self { + raw: notebook, + index: OnceCell::new(), + // The additional newline at the end is to maintain consistency for + // all cells. These newlines will be removed before updating the + // source code with the transformed content. Refer `update_cell_content`. + content: contents.join("\n") + "\n", + cell_offsets, + valid_code_cells, + }) + } +} + #[cfg(test)] mod test { use std::path::Path; @@ -418,6 +421,9 @@ mod test { use crate::test::{test_notebook_path, test_resource_path}; use crate::{assert_messages, settings}; + use crate::source_kind::CodeExtractor; + use crate::test::test_resource_path; + /// Read a Jupyter cell from the `resources/test/fixtures/jupyter/cell` directory. fn read_jupyter_cell(path: impl AsRef) -> Result { let path = test_resource_path("fixtures/jupyter/cell").join(path); @@ -428,21 +434,21 @@ mod test { #[test] fn test_valid() { let path = Path::new("resources/test/fixtures/jupyter/valid.ipynb"); - assert!(Notebook::read(path).is_ok()); + assert!(Notebook::extract_code(path).is_ok()); } #[test] fn test_r() { // We can load this, it will be filtered out later let path = Path::new("resources/test/fixtures/jupyter/R.ipynb"); - assert!(Notebook::read(path).is_ok()); + assert!(Notebook::extract_code(path).is_ok()); } #[test] fn test_invalid() { let path = Path::new("resources/test/fixtures/jupyter/invalid_extension.ipynb"); assert_eq!( - Notebook::read(path).unwrap_err().kind.body, + Notebook::extract_code(path).unwrap_err().kind.body, "SyntaxError: Expected a Jupyter Notebook (.ipynb extension), \ which must be internally stored as JSON, \ but found a Python source file: \ @@ -450,14 +456,14 @@ mod test { ); let path = Path::new("resources/test/fixtures/jupyter/not_json.ipynb"); assert_eq!( - 
Notebook::read(path).unwrap_err().kind.body, + Notebook::extract_code(path).unwrap_err().kind.body, "SyntaxError: A Jupyter Notebook (.ipynb) must internally be JSON, \ but this file isn't valid JSON: \ expected value at line 1 column 1" ); let path = Path::new("resources/test/fixtures/jupyter/wrong_schema.ipynb"); assert_eq!( - Notebook::read(path).unwrap_err().kind.body, + Notebook::extract_code(path).unwrap_err().kind.body, "SyntaxError: This file does not match the schema expected of Jupyter Notebooks: \ missing field `cells` at line 1 column 2" ); @@ -485,7 +491,7 @@ mod test { #[test] fn test_concat_notebook() { let path = Path::new("resources/test/fixtures/jupyter/valid.ipynb"); - let notebook = Notebook::read(path).unwrap(); + let notebook = Notebook::extract_code(path).unwrap(); assert_eq!( notebook.content, r#"def unused_variable(): diff --git a/crates/ruff/src/markdown/markdown_adapter.rs b/crates/ruff/src/markdown/markdown_adapter.rs new file mode 100644 index 00000000000000..f9d08bffe3378d --- /dev/null +++ b/crates/ruff/src/markdown/markdown_adapter.rs @@ -0,0 +1,103 @@ +use pulldown_cmark::{CodeBlockKind, CowStr, Event, Parser, Tag}; + +use crate::code_extraction::{BoundedCodeBlock, CodeBlocks, CodeExtractor}; + +#[derive(Debug, Default)] +pub struct MarkdownAdapter {} + +impl CodeExtractor for MarkdownAdapter { + fn extract_code<'a, 'input>(&'a self, input: &'input str) -> CodeBlocks { + let mut in_code_block = false; + let mut code_block: Option = None; + let mut code_blocks = CodeBlocks::new(); + + Parser::new(input) + .into_offset_iter() + .for_each(|(event, range)| match &event { + Event::Start(tag) => match tag { + Tag::CodeBlock(block) => { + if is_python_block(&block) { + in_code_block = true; + } + } + _ => (), + }, + Event::Text(text) => { + if in_code_block { + code_block = Some(BoundedCodeBlock::from_range(text, range)); + } + } + Event::End(_) => { + if let Some(bounded_code_block) = &code_block { + 
code_blocks.add(bounded_code_block.clone()); + in_code_block = false; + code_block = None; + } + } + _ => (), + }); + + code_blocks + } +} + +fn is_python_block(block: &CodeBlockKind) -> bool { + match block { + // TODO: detecting Python code from code blocks which aren't labeled as such? + CodeBlockKind::Fenced(b) => match b { + CowStr::Borrowed(language) => language.to_owned().to_lowercase() == "python", + _ => false, + }, + CodeBlockKind::Indented => false, + } +} + +#[cfg(test)] +mod tests { + use ruff_text_size::TextSize; + + use crate::code_extraction::BoundedCodeBlock; + + use super::*; + + #[test] + fn test_get_code_blocks() { + let input = r#" +# Heading 1 +Some more content + +``` +# fenced block with no language specified +def foo(): + return x +``` + +```python +# fenced block with language specified +def bar(): + return y +``` + +and now let's make sure that a second block is picked up +```python +def barTwo(): + return z +``` + "#; + let adapter = MarkdownAdapter::default(); + let code_blocks = adapter.extract_code(input); + assert_eq!( + vec![ + BoundedCodeBlock::new( + "# fenced block with language specified\ndef bar():\n return y\n".to_owned(), + TextSize::new(117) + ), + BoundedCodeBlock::new( + "def barTwo():\n return z\n".to_owned(), + TextSize::new(252) + ) + ], + code_blocks.blocks + ); + } +} diff --git a/crates/ruff/src/markdown/mod.rs b/crates/ruff/src/markdown/mod.rs new file mode 100644 index 00000000000000..b4d28d80cd1bf0 --- /dev/null +++ b/crates/ruff/src/markdown/mod.rs @@ -0,0 +1,3 @@ +mod markdown_adapter; + +pub use markdown_adapter::*; diff --git a/crates/ruff/src/source_kind.rs b/crates/ruff/src/source_kind.rs index ab63e89c28f2ff..9874e27d82e301 100644 --- a/crates/ruff/src/source_kind.rs +++ b/crates/ruff/src/source_kind.rs @@ -1,3 +1,7 @@ +use std::path::Path; + +use ruff_diagnostics::Diagnostic; + use crate::jupyter::Notebook; #[derive(Clone, Debug, PartialEq, is_macro::Is)] @@ -24,3 +28,10 @@ impl SourceKind { } } } + +pub 
trait CodeExtractor<T>
+where
+    T: Sized,
+{
+    fn extract_code(path: &Path) -> Result<T, Box<Diagnostic>>;
+}
diff --git a/crates/ruff_cli/src/diagnostics.rs b/crates/ruff_cli/src/diagnostics.rs
index f8efe174c2da4b..451899059081ac 100644
--- a/crates/ruff_cli/src/diagnostics.rs
+++ b/crates/ruff_cli/src/diagnostics.rs
@@ -20,7 +20,7 @@ use ruff::logging::DisplayParseError;
 use ruff::message::Message;
 use ruff::pyproject_toml::lint_pyproject_toml;
 use ruff::settings::{flags, AllSettings, Settings};
-use ruff::source_kind::SourceKind;
+use ruff::source_kind::{CodeExtractor, SourceKind};
 use ruff_python_ast::imports::ImportMap;
 use ruff_python_ast::source_code::{LineIndex, SourceCode, SourceFileBuilder};
 use ruff_python_stdlib::path::is_project_toml;
@@ -67,7 +67,7 @@ impl AddAssign for Diagnostics {
 /// Returns either an indexed python jupyter notebook or a diagnostic (which is empty if we skip)
 fn load_jupyter_notebook(path: &Path) -> Result<Notebook, Box<Diagnostics>> {
-    let notebook = match Notebook::read(path) {
+    let notebook = match Notebook::extract_code(path) {
         Ok(notebook) => {
             if !notebook.is_python_notebook() {
                 // Not a python notebook, this could e.g. be an R notebook which we want to just skip