Implements Python import statement parser to avoid ast module.

cloudflare · Sep 13, 2024 · dab3faa · dab3faa
1 parent 6a236a1
commit dab3faa
Show file tree

Hide file tree

Showing 7 changed files with 346 additions and 10 deletions.
diff --git a/src/pyodide/internal/snapshot.ts b/src/pyodide/internal/snapshot.ts
@@ -13,6 +13,7 @@ import {
   MEMORY_SNAPSHOT_READER,
 } from 'pyodide-internal:metadata';
 import { reportError, simpleRunPython } from 'pyodide-internal:util';
+import { default as MetadataReader } from 'pyodide-internal:runtime-generated/metadata';
 
 let LOADED_BASELINE_SNAPSHOT: number;
 
@@ -271,14 +272,20 @@ function memorySnapshotDoImports(Module: Module): Array<string> {
   );
   simpleRunPython(Module, processScriptImportsString);
 
-  const importedModules: Array<string> = JSON.parse(
-    simpleRunPython(
-      Module,
-      'import sys, json; print(json.dumps(CF_LOADED_MODULES), file=sys.stderr)'
-    )
-  );
-
-  return importedModules;
+  // The `importedModules` list will contain all modules that have been imported, including local
+  // modules, the usual `js` and other stdlib modules. We want to filter out local imports, so we
+  // grab them and put them into a set for fast filtering.
+  const localModulePaths: Set<string> = new Set<string>(MetadataReader.getNames());
+  // @ts-ignore parsePythonScriptImports is a static method.
+  const importedModules: Array<string> = ArtifactBundler.constructor.parsePythonScriptImports(
+    MetadataReader.getWorkerFiles("py")
+  ).filter((module: string) => {
+    const moduleFilename = module.replace(".", "/") + ".py";
+    return !localModulePaths.has(moduleFilename) && module != "js"
+  });
+
+  // Deduplicate the modules.
+  return [...new Set(importedModules)];
 }
 
 function checkLoadedSoFiles(dsoJSON: DylinkInfo): void {

diff --git a/src/pyodide/types/FS.d.ts b/src/pyodide/types/FS.d.ts
@@ -29,6 +29,7 @@ interface FS {
     mode: number
   ): FSNode<Info>;
   isFile: (mode: number) => boolean;
+  readdir: (path: string) => Array<string>;
   genericErrors: Error[];
 }
 

diff --git a/src/pyodide/types/runtime-generated/metadata.d.ts b/src/pyodide/types/runtime-generated/metadata.d.ts
@@ -7,6 +7,7 @@ declare namespace MetadataReader {
   const getMainModule: () => string;
   const hasMemorySnapshot: () => boolean;
   const getNames: () => string[];
+  const getWorkerFiles: (ext: string) => string[];
   const getSizes: () => number[];
   const readMemorySnapshot: (
     offset: number,

diff --git a/src/workerd/api/BUILD.bazel b/src/workerd/api/BUILD.bazel
@@ -112,6 +112,14 @@ wd_cc_library(
     ],
 )
 
+# Unit test target for pyodide/pyodide-test.c++; links against the :pyodide library and the
+# shared workerd test fixture.
+kj_test(
+    src = "pyodide/pyodide-test.c++",
+    deps = [
+        ":pyodide",
+        "//src/workerd/tests:test-fixture",
+    ],
+)
+
 wd_cc_library(
     name = "data-url",
     srcs = ["data-url.c++"],

diff --git a/src/workerd/api/pyodide/pyodide-test.c++ b/src/workerd/api/pyodide/pyodide-test.c++
@@ -0,0 +1,109 @@
+// Copyright (c) 2017-2022 Cloudflare, Inc.
+// Licensed under the Apache 2.0 license found in the LICENSE file or at:
+//     https://opensource.org/licenses/Apache-2.0
+
+#include "pyodide.h"
+#include <kj/test.h>
+
+namespace workerd::api {
+namespace {
+
+// Copies every given Python source into an owned kj::String and feeds the whole batch to the
+// import parser, returning whatever module names it reports.
+kj::Array<kj::String> parseSources(std::initializer_list<kj::StringPtr> sources) {
+  auto owned = kj::heapArrayBuilder<kj::String>(sources.size());
+  for (kj::StringPtr source: sources) {
+    owned.add(kj::str(source));
+  }
+  return pyodide::ArtifactBundler::parsePythonScriptImports(owned.finish());
+}
+
+// Requires that `actual` holds exactly the names in `expected`, in the same order.
+void expectModules(
+    kj::ArrayPtr<const kj::String> actual, std::initializer_list<kj::StringPtr> expected) {
+  KJ_REQUIRE(actual.size() == expected.size());
+  size_t index = 0;
+  for (kj::StringPtr name: expected) {
+    KJ_REQUIRE(actual[index] == name);
+    index++;
+  }
+}
+
+KJ_TEST("basic `import` tests") {
+  // Results from several files are concatenated in file order.
+  expectModules(parseSources({"import a\nimport z", "import b"}), {"a", "z", "b"});
+}
+
+KJ_TEST("supports whitespace") {
+  expectModules(parseSources({"import      a\nimport    \n\tz"}), {"a", "z"});
+}
+
+KJ_TEST("basic `from` test") {
+  // Only the module being imported from is reported, never the imported names.
+  expectModules(parseSources({"from x import a,b\nfrom z import y"}), {"x", "z"});
+}
+
+KJ_TEST("ignores indented blocks") {
+  expectModules(parseSources({"import a\nif True:\n  import x\nimport y"}), {"a", "y"});
+}
+
+KJ_TEST("supports nested imports") {
+  expectModules(parseSources({"import a.b\nimport z.x.y.i"}), {"a.b", "z.x.y.i"});
+}
+
+KJ_TEST("nested `from` test") {
+  expectModules(parseSources({"from x.y.z import a,b\nfrom z import y"}), {"x.y.z", "z"});
+}
+
+KJ_TEST("ignores trailing period") {
+  expectModules(parseSources({"import a.b.\nimport z.x.y.i."}), {});
+}
+
+KJ_TEST("ignores relative import") {
+  // This is where we diverge from the old AST-based approach. It would have returned `y` in the
+  // input below.
+  expectModules(parseSources({"import .a.b\nimport ..z.x\nfrom .y import x"}), {});
+}
+
+KJ_TEST("supports commas") {
+  expectModules(parseSources({"import a,b"}), {"a", "b"});
+}
+
+KJ_TEST("supports backslash") {
+  // A backslash (line continuation) may appear anywhere whitespace is allowed.
+  expectModules(parseSources({
+                  "import a\\\n,b",
+                  "import\\\n q,w",
+                  "from \\\nx import y",
+                  "from \\\n   c import y",
+                }),
+      {"a", "b", "q", "w", "x", "c"});
+}
+
+}  // namespace
+}  // namespace workerd::api
diff --git a/src/workerd/api/pyodide/pyodide.c++ b/src/workerd/api/pyodide/pyodide.c++
@@ -1,3 +1,6 @@
+// Copyright (c) 2017-2022 Cloudflare, Inc.
+// Licensed under the Apache 2.0 license found in the LICENSE file or at:
+//     https://opensource.org/licenses/Apache-2.0
 #include "pyodide.h"
 #include <kj/string.h>
 #include <workerd/util/string-buffer.h>
@@ -64,6 +67,17 @@ kj::Array<jsg::JsRef<jsg::JsString>> PyodideMetadataReader::getNames(jsg::Lock&
   return builder.finish();
 }
 
+// Returns the contents of every worker file whose name ends with `ext`, in the order the files
+// appear in `names`. `ext` is matched as a plain suffix of the file name (no `.` is implied).
+//
+// A kj::Vector is used rather than a fixed-capacity ArrayBuilder: only a subset of the files
+// usually match, and ArrayBuilder::finish() must not be called on a builder that has not been
+// filled to capacity.
+kj::Array<jsg::JsRef<jsg::JsString>> PyodideMetadataReader::getWorkerFiles(
+    jsg::Lock& js, kj::String ext) {
+  // Reserve for the worst case (every file matches) so that adding never reallocates.
+  auto matches = kj::Vector<jsg::JsRef<jsg::JsString>>(this->names.size());
+  for (auto i: kj::zeroTo(this->names.size())) {
+    if (this->names[i].endsWith(ext)) {
+      matches.add(js, js.str(this->contents[i]));
+    }
+  }
+  // releaseAsArray() trims the allocation down to the number of elements actually added.
+  return matches.releaseAsArray();
+}
+
 kj::Array<jsg::JsRef<jsg::JsString>> PyodideMetadataReader::getRequirements(jsg::Lock& js) {
   auto builder = kj::heapArrayBuilder<jsg::JsRef<jsg::JsString>>(this->requirements.size());
   for (auto i: kj::zeroTo(builder.capacity())) {
@@ -102,6 +116,176 @@ int ArtifactBundler::readMemorySnapshot(int offset, kj::Array<kj::byte> buf) {
   return readToTarget(KJ_REQUIRE_NONNULL(existingSnapshot), offset, buf);
 }
 
+// Scans Python sources for top-level `import a, b.c` and `from a.b import c` statements and
+// returns the absolute module names they reference, in order of appearance across `files`.
+// Duplicates are kept.
+//
+// This is a deliberately small line-oriented scanner, not a Python parser:
+//  - only statements that start in column 0 are considered; indented lines are skipped;
+//  - relative imports (module name starting with `.`) are ignored;
+//  - module names with a trailing `.` are invalid and are dropped;
+//  - whitespace, newlines and backslash continuations are accepted between tokens;
+//  - `import`/`from` must be followed by whitespace or a backslash, so top-level statements such
+//    as `importlib.reload(x)` or `from_date = 1` are not mistaken for imports.
+//
+// Throws (via KJ_REQUIRE) when an import keyword is not followed by a module name.
+kj::Array<kj::String> ArtifactBundler::parsePythonScriptImports(kj::Array<kj::String> files) {
+  auto result = kj::Vector<kj::String>();
+
+  // Characters that may separate the tokens of an import statement. Python expects a `\` to be
+  // paired with a newline, but we don't have to be as strict here.
+  auto isSeparator = [](char c) -> bool {
+    return c == '\r' || c == '\n' || c == ' ' || c == '\t' || c == '\\';
+  };
+
+  // Returns the position just past the end of the line containing `pos`: everything up to the
+  // next `\n` or `\r` is skipped, plus that single terminator character when there is one. For
+  // `\r\n` input only the `\r` is consumed here; the main loop consumes the leftover `\n`.
+  auto skipLine = [](kj::StringPtr str, size_t pos) -> size_t {
+    while (pos < str.size() && str[pos] != '\n' && str[pos] != '\r') {
+      pos++;
+    }
+    return pos < str.size() ? pos + 1 : pos;
+  };
+
+  // Returns the position of the first non-separator character at or after `pos`.
+  auto skipSeparators = [&isSeparator](kj::StringPtr str, size_t pos) -> size_t {
+    while (pos < str.size() && isSeparator(str[pos])) {
+      pos++;
+    }
+    return pos;
+  };
+
+  // True when `keyword` occurs at `pos` as a whole word, i.e. it is immediately followed by a
+  // separator. A keyword at the very end of the input cannot start an import and is rejected.
+  auto startsWithKeyword = [&isSeparator](
+                               kj::StringPtr str, kj::StringPtr keyword, size_t pos) -> bool {
+    if (!str.slice(pos).startsWith(keyword)) {
+      return false;
+    }
+    size_t end = pos + keyword.size();
+    return end < str.size() && isSeparator(str[end]);
+  };
+
+  // Returns the length of the dotted module name at `pos`, or 0 if there is none.
+  //
+  // https://docs.python.org/3/reference/lexical_analysis.html#identifiers
+  //
+  // We also accept `.` because import idents can contain it. Characters are classified with
+  // explicit ASCII ranges instead of <cctype>: passing a negative `char` (any non-ASCII byte) to
+  // std::isalpha/std::isdigit is undefined behaviour. Bytes with the high bit set are treated as
+  // identifier characters so that UTF-8 encoded Unicode module names are kept whole.
+  auto identLength = [](kj::StringPtr str, size_t pos) -> size_t {
+    auto isDigit = [](char c) -> bool { return c >= '0' && c <= '9'; };
+    auto isIdentChar = [&isDigit](char c) -> bool {
+      return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isDigit(c) || c == '_' ||
+          c == '.' || (static_cast<unsigned char>(c) & 0x80) != 0;
+    };
+
+    // Identifiers cannot start with a digit.
+    if (pos >= str.size() || isDigit(str[pos])) {
+      return 0;
+    }
+    size_t len = 0;
+    while (pos + len < str.size() && isIdentChar(str[pos + len])) {
+      len++;
+    }
+    return len;
+  };
+
+  for (auto& file: files) {
+    kj::StringPtr source = file;
+    size_t i = 0;
+    while (i < source.size()) {
+      switch (source[i]) {
+        case '\n':
+          // Blank line, or the second half of a `\r\n` pair whose `\r` was consumed by skipLine.
+          i++;
+          break;
+        case 'i':
+        case 'f': {
+          bool isImport = source[i] == 'i';
+          auto keyword = isImport ? "import"_kj : "from"_kj;
+          if (!startsWithKeyword(source, keyword, i)) {
+            // Some other statement that merely starts with `i` or `f`.
+            i = skipLine(source, i);
+            break;
+          }
+          i += keyword.size();  // skip "import" or "from"
+
+          // One iteration per module name; only `import` can list several, separated by commas.
+          while (i < source.size()) {
+            i = skipSeparators(source, i);
+
+            if (i < source.size() && source[i] == '.') {
+              // Ignore relative imports; the rest of the line is skipped by the outer loop.
+              break;
+            }
+
+            size_t identLen = identLength(source, i);
+            KJ_REQUIRE(identLen > 0, "expected a module name after `import`/`from`");
+
+            auto ident = source.slice(i, i + identLen);
+            if (ident[identLen - 1] != '.') {  // trailing period means the import is invalid
+              result.add(kj::heapString(ident));
+            }
+            i += identLen;
+
+            if (!isImport) {
+              // The "from" statement names exactly one module; what follows it (` import ...`)
+              // starts with whitespace and is skipped as an indented line by the outer loop.
+              break;
+            }
+
+            i = skipSeparators(source, i);
+            if (i < source.size() && source[i] == ',') {
+              i++;  // Skip the comma and parse the next module name.
+            } else {
+              // No more idents in this statement.
+              break;
+            }
+          }
+          break;
+        }
+        default:
+          // Indented line (leading ' ' or '\t') or any other statement: skip to the next line.
+          i = skipLine(source, i);
+          break;
+      }
+    }
+  }
+
+  // jsg doesn't support kj::Vector return types, so hand the storage over as a kj::Array.
+  return result.releaseAsArray();
+}
+
 jsg::Ref<PyodideMetadataReader> makePyodideMetadataReader(
     Worker::Reader conf, const PythonConfig& pythonConfig) {
   auto modules = conf.getModules();