running flake8 on notebooks during CI + formatting (catherinedevlin#125)

* running nbqa during CI, formatting notebooks
* linting in two steps
pmfischer · Feb 11, 2023 · 877a7ef · 877a7ef
1 parent 1e937d1
commit 877a7ef
Show file tree

Hide file tree

Showing 14 changed files with 101 additions and 46 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -21,9 +21,14 @@ jobs:
 
       - name: Lint with flake8
         run: |
+
           python -m pip install --upgrade pip
+          # run flake8 on .py files
           pip install flake8
           flake8
+          # run flake8 on notebooks (.ipynb, .md, etc)
+          pip install jupytext nbqa
+          nbqa flake8 .
 
       - name: Install dependencies
         run: |

diff --git a/doc/api/magic-plot.md b/doc/api/magic-plot.md
@@ -35,8 +35,10 @@ from pathlib import Path
 from urllib.request import urlretrieve
 
 if not Path("penguins.csv").is_file():
-    urlretrieve("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv",
-                "penguins.csv")
+    urlretrieve(
+        "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv",
+        "penguins.csv",
+    )
 ```
 
 ```{code-cell} ipython3
@@ -149,6 +151,6 @@ WHERE body_mass_g IS NOT NULL
 
 ```{code-cell} ipython3
 ax = %sqlplot histogram --table no-nulls --column body_mass_g --with no-nulls
-ax.set_title('Body mass (grams)')
+ax.set_title("Body mass (grams)")
 _ = ax.grid()
 ```
diff --git a/doc/api/magic-sql.md b/doc/api/magic-sql.md
@@ -183,11 +183,13 @@ LIMIT 3
 ```{code-cell} ipython3
 from string import Template
 
-template = Template("""
+template = Template(
+    """
 SELECT *
 FROM my_data
 LIMIT $limit
-""")
+"""
+)
 
 limit_one = template.substitute(limit=1)
 limit_two = template.substitute(limit=2)
@@ -241,11 +243,13 @@ result.csv(filename="my_data.csv")
 from pathlib import Path
 
 # generate sql file
-Path("my-query.sql").write_text("""
+Path("my-query.sql").write_text(
+    """
 SELECT *
 FROM my_data
 LIMIT 3
-""")
+"""
+)
 ```
 
 ```{code-cell} ipython3

diff --git a/doc/compose.md b/doc/compose.md
@@ -4,7 +4,7 @@ jupytext:
     extension: .md
     format_name: myst
     format_version: 0.13
-    jupytext_version: 1.14.0
+    jupytext_version: 1.14.4
 kernelspec:
   display_name: Python 3 (ipykernel)
   language: python
@@ -24,23 +24,25 @@ pip install jupysql matplotlib
 *New in version 0.4.3*
 
 ```{note}
-This is a beta feature, please [join our community](https://ploomber.io/community) and let us know how we can improve it!
+This is a beta feature, please [join our community](https://ploomber.io/community) and
+let us know how we can improve it!
 ```
 
-JupySQL allows you to break queries into multiple cells, simplifying the process of building large queries.
+JupySQL allows you to break queries into multiple cells, simplifying the process of
+building large queries.
 
-As an example, we are using a sales database from a record store. We'll find the artists that have produced the largest number of Rock and Metal songs.
+As an example, we are using a sales database from a record store. We'll find the
+artists that have produced the largest number of Rock and Metal songs.
 
 Let's load some data:
 
 ```{code-cell} ipython3
 import urllib.request
 from pathlib import Path
-from sqlite3 import connect
 
-if not Path('my.db').is_file():
-    url = "https://raw.githubusercontent.com/lerocha/chinook-database/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite"
-    urllib.request.urlretrieve(url, 'my.db')
+if not Path("my.db").is_file():
+    url = "https://raw.githubusercontent.com/lerocha/chinook-database/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite"  # noqa
+    urllib.request.urlretrieve(url, "my.db")
 ```
 
 Initialize the extension and set `autolimit=3` so we only retrieve a few rows.

diff --git a/doc/csv.md b/doc/csv.md
@@ -6,7 +6,7 @@ jupytext:
     extension: .md
     format_name: myst
     format_version: 0.13
-    jupytext_version: 1.14.0
+    jupytext_version: 1.14.4
 kernelspec:
   display_name: Python 3 (ipykernel)
   language: python
@@ -32,12 +32,12 @@ INSERT INTO writer VALUES ('Bertold', 'Brecht', 1956);
 
 ```{code-cell} ipython3
 result = %sql SELECT * FROM writer
-result.csv(filename='writer.csv')
+result.csv(filename="writer.csv")
 ```
 
 ```{code-cell} ipython3
 import pandas as pd
 
-df = pd.read_csv('writer.csv')
+df = pd.read_csv("writer.csv")
 df
 ```
diff --git a/doc/howto.md b/doc/howto.md
@@ -57,7 +57,10 @@ Download some sample data:
 ```{code-cell} ipython3
 from urllib.request import urlretrieve
 
-_ = urlretrieve("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv", "penguins.csv")
+_ = urlretrieve(
+    "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv",
+    "penguins.csv",
+)
 ```
 
 ### Query
@@ -93,11 +96,14 @@ To register a user-defined function (UDF) when using SQLite, you can use [SQLAlc
 from sqlalchemy import create_engine
 from sqlalchemy import event
 
+
 def mysum(x, y):
     return x + y
 
+
 engine = create_engine("sqlite://")
 
+
 @event.listens_for(engine, "connect")
 def connect(conn, rec):
     conn.create_function(name="MYSUM", narg=2, func=mysum)

diff --git a/doc/howto/json.md b/doc/howto/json.md
@@ -49,10 +49,30 @@ from pathlib import Path
 import json
 
 data = [
-    {"name": "John", "age": 25,  "friends": ["Jake", "Kelly"], "likes": {"pizza": True, "tacos": True}},
-    {"name": "Jake", "age": 20,  "friends": ["John"], "likes": {"pizza": False, "tacos": True}},
-    {"name": "Kelly", "age": 21,  "friends": ["John", "Sam"], "likes": {"pizza": True, "tacos": True}},
-    {"name": "Sam", "age": 22,  "friends": ["Kelly"], "likes": {"pizza": False, "tacos": True}},
+    {
+        "name": "John",
+        "age": 25,
+        "friends": ["Jake", "Kelly"],
+        "likes": {"pizza": True, "tacos": True},
+    },
+    {
+        "name": "Jake",
+        "age": 20,
+        "friends": ["John"],
+        "likes": {"pizza": False, "tacos": True},
+    },
+    {
+        "name": "Kelly",
+        "age": 21,
+        "friends": ["John", "Sam"],
+        "likes": {"pizza": True, "tacos": True},
+    },
+    {
+        "name": "Sam",
+        "age": 22,
+        "friends": ["Kelly"],
+        "likes": {"pizza": False, "tacos": True},
+    },
 ]
 
 lines = ""

diff --git a/doc/howto/postgres-connect.ipynb b/doc/howto/postgres-connect.ipynb
@@ -146,7 +146,9 @@
    "source": [
     "import pandas as pd\n",
     "\n",
-    "df = pd.read_parquet(\"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet\")\n",
+    "df = pd.read_parquet(\n",
+    "    \"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet\"\n",
+    ")\n",
     "df.shape"
    ]
   },

diff --git a/doc/integrations/duckdb.md b/doc/integrations/duckdb.md
@@ -33,8 +33,10 @@ Get a sample `.csv.` file:
 ```{code-cell} ipython3
 from urllib.request import urlretrieve
 
-_ = urlretrieve("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv",
-                "penguins.csv")
+_ = urlretrieve(
+    "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv",
+    "penguins.csv",
+)
 ```
 
 ### Query
@@ -85,8 +87,10 @@ Download sample `.parquet` file:
 ```{code-cell} ipython3
 from urllib.request import urlretrieve
 
-_ = urlretrieve("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet",
-                "yellow_tripdata_2021-01.parquet")
+_ = urlretrieve(
+    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet",
+    "yellow_tripdata_2021-01.parquet",
+)
 ```
 
 ### Query
@@ -135,12 +139,11 @@ If you have a large SQlite database, you can use DuckDB to perform analytical qu
 ```{code-cell} ipython3
 import urllib.request
 from pathlib import Path
-from sqlite3 import connect
 
 # download sample database
-if not Path('my.db').is_file():
-    url = "https://raw.githubusercontent.com/lerocha/chinook-database/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite"
-    urllib.request.urlretrieve(url, 'my.db')
+if not Path("my.db").is_file():
+    url = "https://raw.githubusercontent.com/lerocha/chinook-database/master/ChinookDatabase/DataSources/Chinook_Sqlite.sqlite"  # noqa
+    urllib.request.urlretrieve(url, "my.db")
 ```
 
 We'll use `sqlite_scanner` extension to load a sample SQLite database into DuckDB:
@@ -177,10 +180,10 @@ N_MONTHS = 3
 
 # https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
 for i in range(1, N_MONTHS + 1):
-    filename = f'yellow_tripdata_2021-{str(i).zfill(2)}.parquet'
+    filename = f"yellow_tripdata_2021-{str(i).zfill(2)}.parquet"
     if not Path(filename).is_file():
-        print(f'Downloading: {filename}')
-        url = f'https://d37ci6vzurychx.cloudfront.net/trip-data/{filename}'
+        print(f"Downloading: {filename}")
+        url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{filename}"
         urllib.request.urlretrieve(url, filename)
 ```
 

diff --git a/doc/integrations/mindsdb.ipynb b/doc/integrations/mindsdb.ipynb
diff --git a/doc/intro.md b/doc/intro.md
@@ -128,7 +128,7 @@ Bind variables (bind parameters) can be used in the "named" (:x) style.
 The variable names used should be defined in the local namespace.
 
 ```{code-cell} ipython3
-name = 'Python'
+name = "Python"
 ```
 
 ```{code-cell} ipython3

diff --git a/doc/plot.md b/doc/plot.md
@@ -14,7 +14,8 @@ kernelspec:
 # Plotting
 
 ```{versionadded} 0.5.2
-`%sqlplot` was introduceed in 0.5.2; however, the underlying [Python API](api/python.html#sql-plot) was introduced in 0.4.4
+`%sqlplot` was introduced in 0.5.2; however, the underlying
+[Python API](api/python.html#sql-plot) was introduced in 0.4.4
 ```
 
 
@@ -38,9 +39,10 @@ In this example, we'll demonstrate this second use case and query a `.parquet` f
 from pathlib import Path
 from urllib.request import urlretrieve
 
+url = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet"
+
 if not Path("yellow_tripdata_2021-01.parquet").is_file():
-    urlretrieve("https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet",
-                "yellow_tripdata_2021-01.parquet")
+    urlretrieve(url, "yellow_tripdata_2021-01.parquet")
 ```
 
 ### Setup

diff --git a/doc/quick-start.md b/doc/quick-start.md
@@ -50,8 +50,10 @@ from pathlib import Path
 from urllib.request import urlretrieve
 
 if not Path("penguins.csv").is_file():
-    urlretrieve("https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv",
-                "penguins.csv")
+    urlretrieve(
+        "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv",
+        "penguins.csv",
+    )
 ```
 
 Start a DuckDB in-memory database:
@@ -75,7 +77,6 @@ For short queries, you can write them in a single line via the `%sql` line magic
 
 For longer queries, you can break them down into multiple lines using the `%%sql` cell magic:
 
-
 ```{code-cell} ipython3
 %%sql
 SELECT *

diff --git a/pyproject.toml b/pyproject.toml
@@ -7,4 +7,13 @@ github = "ploomber/jupysql"
 
 [tool.pkgmt.check_links]
 extensions = ["md", "rst", "py", "ipynb"]
-ignore_substrings = ["d37ci6vzurychx.cloudfront.net"]
+ignore_substrings = ["d37ci6vzurychx.cloudfront.net"]
+
+[tool.nbqa.addopts]
+flake8 = [
+    # E402: notebooks allow non-top imports
+    # F821: jupysql notebooks might have "undefined name" errors
+    # due to the << operator
+    # NOTE: a single --extend-ignore adds to flake8's ignore list; a separate
+    # --ignore would replace the default/configured ignore list entirely
+    "--extend-ignore=E402,F821",
+]