From 11f9a46cef7e5b534f409dfa7f996910eb148f1c Mon Sep 17 00:00:00 2001 From: Evan Rittenhouse Date: Thu, 15 Jun 2023 12:14:39 -0500 Subject: [PATCH] Add CodeExtractor trait --- crates/ruff/src/jupyter/notebook.rs | 274 ++++++++++--------- crates/ruff/src/markdown/markdown_adapter.rs | 103 +++++++ crates/ruff/src/markdown/mod.rs | 3 + crates/ruff/src/source_kind.rs | 11 + crates/ruff_cli/src/diagnostics.rs | 4 +- 5 files changed, 259 insertions(+), 136 deletions(-) create mode 100644 crates/ruff/src/markdown/markdown_adapter.rs create mode 100644 crates/ruff/src/markdown/mod.rs diff --git a/crates/ruff/src/jupyter/notebook.rs b/crates/ruff/src/jupyter/notebook.rs index d7381cf56f5969..25087dd4582376 100644 --- a/crates/ruff/src/jupyter/notebook.rs +++ b/crates/ruff/src/jupyter/notebook.rs @@ -17,6 +17,7 @@ use crate::autofix::source_map::{SourceMap, SourceMarker}; use crate::jupyter::index::JupyterIndex; use crate::jupyter::{Cell, CellType, RawNotebook, SourceValue}; use crate::rules::pycodestyle::rules::SyntaxError; +use crate::source_kind::CodeExtractor; use crate::IOError; pub const JUPYTER_NOTEBOOK_EXT: &str = "ipynb"; @@ -25,7 +26,7 @@ const MAGIC_PREFIX: [&str; 3] = ["%", "!", "?"]; /// Run round-trip source code generation on a given Jupyter notebook file path. 
pub fn round_trip(path: &Path) -> anyhow::Result { - let mut notebook = Notebook::read(path).map_err(|err| { + let mut notebook = Notebook::extract_code(path).map_err(|err| { anyhow::anyhow!( "Failed to read notebook file `{}`: {:?}", path.display(), @@ -97,133 +98,6 @@ pub struct Notebook { } impl Notebook { - /// See also the black implementation - /// - pub fn read(path: &Path) -> Result> { - let reader = BufReader::new(File::open(path).map_err(|err| { - Diagnostic::new( - IOError { - message: format!("{err}"), - }, - TextRange::default(), - ) - })?); - let notebook: RawNotebook = match serde_json::from_reader(reader) { - Ok(notebook) => notebook, - Err(err) => { - // Translate the error into a diagnostic - return Err(Box::new({ - match err.classify() { - Category::Io => Diagnostic::new( - IOError { - message: format!("{err}"), - }, - TextRange::default(), - ), - Category::Syntax | Category::Eof => { - // Maybe someone saved the python sources (those with the `# %%` separator) - // as jupyter notebook instead. Let's help them. 
- let contents = std::fs::read_to_string(path).map_err(|err| { - Diagnostic::new( - IOError { - message: format!("{err}"), - }, - TextRange::default(), - ) - })?; - // Check if tokenizing was successful and the file is non-empty - if (ruff_rustpython::tokenize(&contents)) - .last() - .map_or(true, Result::is_err) - { - Diagnostic::new( - SyntaxError { - message: format!( - "A Jupyter Notebook (.{JUPYTER_NOTEBOOK_EXT}) must internally be JSON, \ - but this file isn't valid JSON: {err}" - ), - }, - TextRange::default(), - ) - } else { - Diagnostic::new( - SyntaxError { - message: format!( - "Expected a Jupyter Notebook (.{JUPYTER_NOTEBOOK_EXT} extension), \ - which must be internally stored as JSON, \ - but found a Python source file: {err}" - ), - }, - TextRange::default(), - ) - } - } - Category::Data => { - // We could try to read the schema version here but if this fails it's - // a bug anyway - Diagnostic::new( - SyntaxError { - message: format!( - "This file does not match the schema expected of Jupyter Notebooks: {err}" - ), - }, - TextRange::default(), - ) - } - } - })); - } - }; - - // v4 is what everybody uses - if notebook.nbformat != 4 { - // bail because we should have already failed at the json schema stage - return Err(Box::new(Diagnostic::new( - SyntaxError { - message: format!( - "Expected Jupyter Notebook format 4, found {}", - notebook.nbformat - ), - }, - TextRange::default(), - ))); - } - - let valid_code_cells = notebook - .cells - .iter() - .enumerate() - .filter(|(_, cell)| cell.is_valid_code_cell()) - .map(|(pos, _)| u32::try_from(pos).unwrap()) - .collect::>(); - - let mut contents = Vec::with_capacity(valid_code_cells.len()); - let mut current_offset = TextSize::from(0); - let mut cell_offsets = Vec::with_capacity(notebook.cells.len()); - cell_offsets.push(TextSize::from(0)); - - for &pos in &valid_code_cells { - let cell_contents = match ¬ebook.cells[pos as usize].source { - SourceValue::String(string) => string.clone(), - 
SourceValue::StringArray(string_array) => string_array.join(""), - }; - current_offset += TextSize::of(&cell_contents) + TextSize::new(1); - contents.push(cell_contents); - cell_offsets.push(current_offset); - } - - Ok(Self { - raw: notebook, - index: OnceCell::new(), - // The additional newline at the end is to maintain consistency for - // all cells. These newlines will be removed before updating the - // source code with the transformed content. Refer `update_cell_content`. - content: contents.join("\n") + "\n", - cell_offsets, - valid_code_cells, - }) - } - /// Update the cell offsets as per the given [`SourceMap`]. fn update_cell_offsets(&mut self, source_map: &SourceMap) { // When there are multiple cells without any edits, the offsets of those @@ -402,6 +276,135 @@ impl Notebook { } } +impl CodeExtractor for Notebook { + /// See also the black implementation + /// + fn extract_code(path: &Path) -> Result> { + let reader = BufReader::new(File::open(path).map_err(|err| { + Diagnostic::new( + IOError { + message: format!("{err}"), + }, + TextRange::default(), + ) + })?); + let notebook: RawNotebook = match serde_json::from_reader(reader) { + Ok(notebook) => notebook, + Err(err) => { + // Translate the error into a diagnostic + return Err(Box::new({ + match err.classify() { + Category::Io => Diagnostic::new( + IOError { + message: format!("{err}"), + }, + TextRange::default(), + ), + Category::Syntax | Category::Eof => { + // Maybe someone saved the python sources (those with the `# %%` separator) + // as jupyter notebook instead. Let's help them. 
+ let contents = std::fs::read_to_string(path).map_err(|err| { + Diagnostic::new( + IOError { + message: format!("{err}"), + }, + TextRange::default(), + ) + })?; + // Check if tokenizing was successful and the file is non-empty + if (ruff_rustpython::tokenize(&contents)) + .last() + .map_or(true, Result::is_err) + { + Diagnostic::new( + SyntaxError { + message: format!( + "A Jupyter Notebook (.{JUPYTER_NOTEBOOK_EXT}) must internally be JSON, \ + but this file isn't valid JSON: {err}" + ), + }, + TextRange::default(), + ) + } else { + Diagnostic::new( + SyntaxError { + message: format!( + "Expected a Jupyter Notebook (.{JUPYTER_NOTEBOOK_EXT} extension), \ + which must be internally stored as JSON, \ + but found a Python source file: {err}" + ), + }, + TextRange::default(), + ) + } + } + Category::Data => { + // We could try to read the schema version here but if this fails it's + // a bug anyway + Diagnostic::new( + SyntaxError { + message: format!( + "This file does not match the schema expected of Jupyter Notebooks: {err}" + ), + }, + TextRange::default(), + ) + } + } + })); + } + }; + + // v4 is what everybody uses + if notebook.nbformat != 4 { + // bail because we should have already failed at the json schema stage + return Err(Box::new(Diagnostic::new( + SyntaxError { + message: format!( + "Expected Jupyter Notebook format 4, found {}", + notebook.nbformat + ), + }, + TextRange::default(), + ))); + } + + let valid_code_cells = notebook + .cells + .iter() + .enumerate() + .filter(|(_, cell)| cell.is_valid_code_cell()) + .map(|(pos, _)| u32::try_from(pos).unwrap()) + .collect::>(); + + let mut contents = Vec::with_capacity(valid_code_cells.len()); + let mut current_offset = TextSize::from(0); + let mut cell_offsets = Vec::with_capacity(notebook.cells.len()); + cell_offsets.push(TextSize::from(0)); + + for &pos in &valid_code_cells { + let cell_contents = match ¬ebook.cells[pos as usize].source { + SourceValue::String(string) => string.clone(), + 
SourceValue::StringArray(string_array) => string_array.join(""), + }; + current_offset += TextSize::of(&cell_contents) + TextSize::new(1); + contents.push(cell_contents); + cell_offsets.push(current_offset); + } + + Ok(Self { + raw: notebook, + index: OnceCell::new(), + // The additional newline at the end is to maintain consistency for + // all cells. These newlines will be removed before updating the + // source code with the transformed content. Refer `update_cell_content`. + content: contents.join("\n") + "\n", + cell_offsets, + valid_code_cells, + }) + } +} + #[cfg(test)] mod test { use std::path::Path; @@ -418,6 +421,9 @@ mod test { use crate::test::{test_notebook_path, test_resource_path}; use crate::{assert_messages, settings}; + use crate::source_kind::CodeExtractor; + use crate::test::test_resource_path; + /// Read a Jupyter cell from the `resources/test/fixtures/jupyter/cell` directory. fn read_jupyter_cell(path: impl AsRef) -> Result { let path = test_resource_path("fixtures/jupyter/cell").join(path); @@ -428,21 +434,21 @@ mod test { #[test] fn test_valid() { let path = Path::new("resources/test/fixtures/jupyter/valid.ipynb"); - assert!(Notebook::read(path).is_ok()); + assert!(Notebook::extract_code(path).is_ok()); } #[test] fn test_r() { // We can load this, it will be filtered out later let path = Path::new("resources/test/fixtures/jupyter/R.ipynb"); - assert!(Notebook::read(path).is_ok()); + assert!(Notebook::extract_code(path).is_ok()); } #[test] fn test_invalid() { let path = Path::new("resources/test/fixtures/jupyter/invalid_extension.ipynb"); assert_eq!( - Notebook::read(path).unwrap_err().kind.body, + Notebook::extract_code(path).unwrap_err().kind.body, "SyntaxError: Expected a Jupyter Notebook (.ipynb extension), \ which must be internally stored as JSON, \ but found a Python source file: \ @@ -450,14 +456,14 @@ mod test { ); let path = Path::new("resources/test/fixtures/jupyter/not_json.ipynb"); assert_eq!( - 
Notebook::read(path).unwrap_err().kind.body, + Notebook::extract_code(path).unwrap_err().kind.body, "SyntaxError: A Jupyter Notebook (.ipynb) must internally be JSON, \ but this file isn't valid JSON: \ expected value at line 1 column 1" ); let path = Path::new("resources/test/fixtures/jupyter/wrong_schema.ipynb"); assert_eq!( - Notebook::read(path).unwrap_err().kind.body, + Notebook::extract_code(path).unwrap_err().kind.body, "SyntaxError: This file does not match the schema expected of Jupyter Notebooks: \ missing field `cells` at line 1 column 2" ); @@ -485,7 +491,7 @@ mod test { #[test] fn test_concat_notebook() { let path = Path::new("resources/test/fixtures/jupyter/valid.ipynb"); - let notebook = Notebook::read(path).unwrap(); + let notebook = Notebook::extract_code(path).unwrap(); assert_eq!( notebook.content, r#"def unused_variable(): diff --git a/crates/ruff/src/markdown/markdown_adapter.rs b/crates/ruff/src/markdown/markdown_adapter.rs new file mode 100644 index 00000000000000..f9d08bffe3378d --- /dev/null +++ b/crates/ruff/src/markdown/markdown_adapter.rs @@ -0,0 +1,103 @@ +use pulldown_cmark::{CodeBlockKind, CowStr, Event, Parser, Tag}; + +use crate::code_extraction::{BoundedCodeBlock, CodeBlocks, CodeExtractor}; + +#[derive(Debug, Default)] +pub struct MarkdownAdapter {} + +impl CodeExtractor for MarkdownAdapter { + fn extract_code<'a, 'input>(&'a self, input: &'input str) -> CodeBlocks { + let mut in_code_block = false; + let mut code_block: Option = None; + let mut code_blocks = CodeBlocks::new(); + + Parser::new(input) + .into_offset_iter() + .for_each(|(event, range)| match &event { + Event::Start(tag) => match tag { + Tag::CodeBlock(block) => { + if is_python_block(&block) { + in_code_block = true; + } + } + _ => (), + }, + Event::Text(text) => { + if in_code_block { + code_block = Some(BoundedCodeBlock::from_range(text, range)); + } + } + Event::End(_) => { + if let Some(bounded_code_block) = &code_block { + 
code_blocks.add(bounded_code_block.clone()); + in_code_block = false; + code_block = None; + } + } + _ => (), + }); + + code_blocks + } +} + +fn is_python_block(block: &CodeBlockKind) -> bool { + match block { + // TODO: detecting Python code from code blocks which aren't labeled as such? + CodeBlockKind::Fenced(b) => match b { + CowStr::Borrowed(language) => language.to_owned().to_lowercase() == "python", + _ => false, + }, + CodeBlockKind::Indented => false, + } +} + +#[cfg(test)] +mod tests { + use ruff_text_size::TextSize; + + use crate::code_extraction::BoundedCodeBlock; + + use super::*; + + #[test] + fn test_get_code_blocks() { + let input = r#" +# Heading 1 +Some more content + +``` +# fenced block with no language specified +def foo(): + return x +``` + +```python +# fenced block with language specified +def bar(): + return y +``` + +and now let's make sure that a second block is picked up +```python +def barTwo(): + return z +``` + "#; + let adapter = MarkdownAdapter::default(); + let code_blocks = adapter.extract_code(input); + assert_eq!( + vec![ + BoundedCodeBlock::new( + "# fenced block with language specified\ndef bar():\n return y\n".to_owned(), + TextSize::new(117) + ), + BoundedCodeBlock::new( + "def barTwo():\n return z\n".to_owned(), + TextSize::new(252) + ) + ], + code_blocks.blocks + ); + } +} diff --git a/crates/ruff/src/markdown/mod.rs b/crates/ruff/src/markdown/mod.rs new file mode 100644 index 00000000000000..b4d28d80cd1bf0 --- /dev/null +++ b/crates/ruff/src/markdown/mod.rs @@ -0,0 +1,3 @@ +mod markdown_adapter; + +pub use markdown_adapter::*; diff --git a/crates/ruff/src/source_kind.rs b/crates/ruff/src/source_kind.rs index ab63e89c28f2ff..9874e27d82e301 100644 --- a/crates/ruff/src/source_kind.rs +++ b/crates/ruff/src/source_kind.rs @@ -1,3 +1,7 @@ +use std::path::Path; + +use ruff_diagnostics::Diagnostic; + use crate::jupyter::Notebook; #[derive(Clone, Debug, PartialEq, is_macro::Is)] @@ -24,3 +28,10 @@ impl SourceKind { } } } + +pub 
trait CodeExtractor<T>
+where
+    T: Sized,
+{
+    fn extract_code(path: &Path) -> Result<T, Box<Diagnostic>>;
+}
diff --git a/crates/ruff_cli/src/diagnostics.rs b/crates/ruff_cli/src/diagnostics.rs
index f8efe174c2da4b..451899059081ac 100644
--- a/crates/ruff_cli/src/diagnostics.rs
+++ b/crates/ruff_cli/src/diagnostics.rs
@@ -20,7 +20,7 @@ use ruff::logging::DisplayParseError;
 use ruff::message::Message;
 use ruff::pyproject_toml::lint_pyproject_toml;
 use ruff::settings::{flags, AllSettings, Settings};
-use ruff::source_kind::SourceKind;
+use ruff::source_kind::{CodeExtractor, SourceKind};
 use ruff_python_ast::imports::ImportMap;
 use ruff_python_ast::source_code::{LineIndex, SourceCode, SourceFileBuilder};
 use ruff_python_stdlib::path::is_project_toml;
@@ -67,7 +67,7 @@ impl AddAssign for Diagnostics {
 /// Returns either an indexed python jupyter notebook or a diagnostic (which is empty if we skip)
 fn load_jupyter_notebook(path: &Path) -> Result<Notebook, Box<Diagnostics>> {
-    let notebook = match Notebook::read(path) {
+    let notebook = match Notebook::extract_code(path) {
         Ok(notebook) => {
             if !notebook.is_python_notebook() {
                 // Not a python notebook, this could e.g. be an R notebook which we want to just skip