Skip to content

Commit

Permalink
Implements Python import statement parser to avoid ast module.
Browse files Browse the repository at this point in the history
  • Loading branch information
dom96 committed Sep 13, 2024
1 parent 6a236a1 commit dab3faa
Show file tree
Hide file tree
Showing 7 changed files with 346 additions and 10 deletions.
23 changes: 15 additions & 8 deletions src/pyodide/internal/snapshot.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import {
MEMORY_SNAPSHOT_READER,
} from 'pyodide-internal:metadata';
import { reportError, simpleRunPython } from 'pyodide-internal:util';
import { default as MetadataReader } from 'pyodide-internal:runtime-generated/metadata';

let LOADED_BASELINE_SNAPSHOT: number;

Expand Down Expand Up @@ -271,14 +272,20 @@ function memorySnapshotDoImports(Module: Module): Array<string> {
);
simpleRunPython(Module, processScriptImportsString);

const importedModules: Array<string> = JSON.parse(
simpleRunPython(
Module,
'import sys, json; print(json.dumps(CF_LOADED_MODULES), file=sys.stderr)'
)
);

return importedModules;
// The `importedModules` list will contain all modules that have been imported, including local
// modules, the usual `js` and other stdlib modules. We want to filter out local imports, so we
// grab them and put them into a set for fast filtering.
const localModulePaths: Set<string> = new Set<string>(MetadataReader.getNames());
// @ts-ignore parsePythonScriptImports is a static method.
const importedModules: Array<string> = ArtifactBundler.constructor.parsePythonScriptImports(
MetadataReader.getWorkerFiles("py")
).filter((module: string) => {
const moduleFilename = module.replace(".", "/") + ".py";
return !localModulePaths.has(moduleFilename) && module != "js"
});

// Deduplicate the modules.
return [...new Set(importedModules)];
}

function checkLoadedSoFiles(dsoJSON: DylinkInfo): void {
Expand Down
1 change: 1 addition & 0 deletions src/pyodide/types/FS.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ interface FS {
mode: number
): FSNode<Info>;
isFile: (mode: number) => boolean;
readdir: (path: string) => Array<string>;
genericErrors: Error[];
}

Expand Down
1 change: 1 addition & 0 deletions src/pyodide/types/runtime-generated/metadata.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ declare namespace MetadataReader {
const getMainModule: () => string;
const hasMemorySnapshot: () => boolean;
const getNames: () => string[];
const getWorkerFiles: (ext: string) => string[];
const getSizes: () => number[];
const readMemorySnapshot: (
offset: number,
Expand Down
8 changes: 8 additions & 0 deletions src/workerd/api/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,14 @@ wd_cc_library(
],
)

# Unit test target for pyodide/pyodide-test.c++. It links against the `:pyodide` library under
# test plus the shared workerd test fixture.
kj_test(
src = "pyodide/pyodide-test.c++",
deps = [
":pyodide",
"//src/workerd/tests:test-fixture",
],
)

wd_cc_library(
name = "data-url",
srcs = ["data-url.c++"],
Expand Down
109 changes: 109 additions & 0 deletions src/workerd/api/pyodide/pyodide-test.c++
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
// Copyright (c) 2017-2022 Cloudflare, Inc.
// Licensed under the Apache 2.0 license found in the LICENSE file or at:
// https://opensource.org/licenses/Apache-2.0

#include "pyodide.h"
#include <kj/test.h>

namespace workerd::api {
namespace {

KJ_TEST("basic `import` tests") {
auto files = kj::heapArrayBuilder<kj::String>(2);
files.add(kj::str("import a\nimport z"));
files.add(kj::str("import b"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 3);
KJ_REQUIRE(result[0] == "a");
KJ_REQUIRE(result[1] == "z");
KJ_REQUIRE(result[2] == "b");
}

KJ_TEST("supports whitespace") {
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("import a\nimport \n\tz"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 2);
KJ_REQUIRE(result[0] == "a");
KJ_REQUIRE(result[1] == "z");
}

KJ_TEST("basic `from` test") {
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("from x import a,b\nfrom z import y"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 2);
KJ_REQUIRE(result[0] == "x");
KJ_REQUIRE(result[1] == "z");
}

KJ_TEST("ignores indented blocks") {
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("import a\nif True:\n import x\nimport y"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 2);
KJ_REQUIRE(result[0] == "a");
KJ_REQUIRE(result[1] == "y");
}

KJ_TEST("supports nested imports") {
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("import a.b\nimport z.x.y.i"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 2);
KJ_REQUIRE(result[0] == "a.b");
KJ_REQUIRE(result[1] == "z.x.y.i");
}

KJ_TEST("nested `from` test") {
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("from x.y.z import a,b\nfrom z import y"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 2);
KJ_REQUIRE(result[0] == "x.y.z");
KJ_REQUIRE(result[1] == "z");
}

KJ_TEST("ignores trailing period") {
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("import a.b.\nimport z.x.y.i."));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 0);
}

KJ_TEST("ignores relative import") {
// This is where we diverge from the old AST-based approach. It would have returned `y` in the
// input below.
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("import .a.b\nimport ..z.x\nfrom .y import x"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 0);
}

KJ_TEST("supports commas") {
auto files = kj::heapArrayBuilder<kj::String>(1);
files.add(kj::str("import a,b"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 2);
KJ_REQUIRE(result[0] == "a");
KJ_REQUIRE(result[1] == "b");
}

KJ_TEST("supports backslash") {
auto files = kj::heapArrayBuilder<kj::String>(4);
files.add(kj::str("import a\\\n,b"));
files.add(kj::str("import\\\n q,w"));
files.add(kj::str("from \\\nx import y"));
files.add(kj::str("from \\\n c import y"));
auto result = pyodide::ArtifactBundler::parsePythonScriptImports(files.finish());
KJ_REQUIRE(result.size() == 6);
KJ_REQUIRE(result[0] == "a");
KJ_REQUIRE(result[1] == "b");
KJ_REQUIRE(result[2] == "q");
KJ_REQUIRE(result[3] == "w");
KJ_REQUIRE(result[4] == "x");
KJ_REQUIRE(result[5] == "c");
}

} // namespace
} // namespace workerd::api
184 changes: 184 additions & 0 deletions src/workerd/api/pyodide/pyodide.c++
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
// Copyright (c) 2017-2022 Cloudflare, Inc.
// Licensed under the Apache 2.0 license found in the LICENSE file or at:
// https://opensource.org/licenses/Apache-2.0
#include "pyodide.h"
#include <kj/string.h>
#include <workerd/util/string-buffer.h>
Expand Down Expand Up @@ -64,6 +67,17 @@ kj::Array<jsg::JsRef<jsg::JsString>> PyodideMetadataReader::getNames(jsg::Lock&
return builder.finish();
}

// Returns the contents of every worker module whose name ends with `ext`, in the order the
// modules appear in `names`.
//
// `ext` is matched as a plain suffix: no leading '.' is implied, so "py" also matches a module
// named e.g. "happy".
//
// Usually only a subset of the modules matches, so the result is accumulated in a kj::Vector
// rather than a fixed-capacity kj::ArrayBuilder: ArrayBuilder::finish() requires the builder to
// have been filled to its full capacity.
kj::Array<jsg::JsRef<jsg::JsString>> PyodideMetadataReader::getWorkerFiles(
    jsg::Lock& js, kj::String ext) {
  kj::Vector<jsg::JsRef<jsg::JsString>> matching(this->names.size());
  for (auto i: kj::zeroTo(this->names.size())) {
    if (this->names[i].endsWith(ext)) {
      matching.add(js, js.str(this->contents[i]));
    }
  }
  return matching.releaseAsArray();
}

kj::Array<jsg::JsRef<jsg::JsString>> PyodideMetadataReader::getRequirements(jsg::Lock& js) {
auto builder = kj::heapArrayBuilder<jsg::JsRef<jsg::JsString>>(this->requirements.size());
for (auto i: kj::zeroTo(builder.capacity())) {
Expand Down Expand Up @@ -102,6 +116,176 @@ int ArtifactBundler::readMemorySnapshot(int offset, kj::Array<kj::byte> buf) {
return readToTarget(KJ_REQUIRE_NONNULL(existingSnapshot), offset, buf);
}

// Extracts the module names referenced by top-level `import` / `from ... import` statements in
// the given Python sources (one string per file) without running Python.
//
// This is a line-oriented heuristic scanner, not a Python lexer:
//   * Only statements that start in column 0 are considered; indented lines are skipped.
//   * `import a, b.c` yields "a" and "b.c"; `from x.y import z` yields "x.y".
//   * Relative imports (leading '.') and names with a trailing '.' are ignored.
//   * Spaces, tabs, newlines and backslashes between tokens are all treated as spacing, which is
//     more lenient than Python's own line-continuation rules.
//   * `as` aliases are not understood: scanning of a statement stops at the alias.
//   * String literals are not recognised, so a line inside a multi-line string that looks like an
//     import statement is reported as one.
//
// Anything that does not look like an import statement is skipped rather than rejected; this
// function never fails on malformed input. Names are returned in order of appearance (file by
// file) and are NOT deduplicated.
kj::Array<kj::String> ArtifactBundler::parsePythonScriptImports(kj::Array<kj::String> files) {
  // Character classes are spelled out for ASCII instead of using <cctype>: the std::is*()
  // functions have undefined behaviour for negative `char` values, which is what the bytes of
  // UTF-8 encoded non-ASCII text look like where `char` is signed. Non-ASCII bytes are therefore
  // never part of an identifier here.
  auto isDigit = [](char c) -> bool { return c >= '0' && c <= '9'; };
  auto isWordChar = [isDigit](char c) -> bool {
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || isDigit(c) || c == '_';
  };
  auto isNewline = [](char c) -> bool { return c == '\n' || c == '\r'; };
  auto isSpacing = [](char c) -> bool {
    return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\\';
  };

  kj::Vector<kj::String> result;

  for (auto& file: files) {
    kj::StringPtr src = file;
    const size_t size = src.size();
    size_t i = 0;

    // Advances `i` past the rest of the current line and one newline character. For "\r\n" the
    // remaining '\n' is consumed by the main loop below.
    auto skipLine = [&]() {
      while (i < size && !isNewline(src[i])) {
        i++;
      }
      if (i < size) {
        i++;
      }
    };

    // Advances `i` past any run of spacing characters (including newlines and backslashes).
    auto skipSpacing = [&]() {
      while (i < size && isSpacing(src[i])) {
        i++;
      }
    };

    // Returns the length of the (possibly dotted) module name starting at `start`, or 0 if there
    // is none. See https://docs.python.org/3/reference/lexical_analysis.html#identifiers
    auto identLength = [&](size_t start) -> size_t {
      if (start >= size || isDigit(src[start])) {
        return 0;
      }
      size_t len = 0;
      while (start + len < size && (isWordChar(src[start + len]) || src[start + len] == '.')) {
        len++;
      }
      return len;
    };

    while (i < size) {
      const char c = src[i];

      if (c == '\n') {
        // Blank line, or the second half of a "\r\n" pair.
        i++;
        continue;
      }

      if (c != 'i' && c != 'f') {
        // Indented line or a statement that cannot be an import: ignore the whole line.
        skipLine();
        continue;
      }

      const bool isImport = c == 'i';
      kj::StringPtr keyword = isImport ? "import"_kj : "from"_kj;
      const size_t keywordEnd = i + keyword.size();

      // The keyword must be a whole word: `importlib.reload(x)` or `from_x = 1` are ordinary
      // statements, not imports.
      const bool isKeyword = src.slice(i).startsWith(keyword) &&
          (keywordEnd >= size || !isWordChar(src[keywordEnd]));
      if (!isKeyword) {
        skipLine();
        continue;
      }
      i = keywordEnd;

      while (i < size) {
        skipSpacing();
        if (i >= size || src[i] == '.') {
          // End of input, or a relative import which we deliberately ignore.
          break;
        }

        const size_t identLen = identLength(i);
        if (identLen == 0) {
          // Not a module name (e.g. text in a docstring, or a dangling comma). Give up on this
          // statement; the main loop skips whatever is left of the line.
          break;
        }

        if (src[i + identLen - 1] != '.') {  // trailing period means the import is invalid
          result.add(kj::heapString(src.slice(i, i + identLen)));
        }
        i += identLen;

        if (!isImport) {
          // `from x import ...`: only the module `x` is of interest.
          break;
        }

        // `import a, b`: keep going for as long as there is a comma after the name.
        skipSpacing();
        if (i < size && src[i] == ',') {
          i++;
        } else {
          break;
        }
      }
    }
  }

  return result.releaseAsArray();
}

jsg::Ref<PyodideMetadataReader> makePyodideMetadataReader(
Worker::Reader conf, const PythonConfig& pythonConfig) {
auto modules = conf.getModules();
Expand Down
Loading

0 comments on commit dab3faa

Please sign in to comment.