Skip to content

Commit

Permalink
pyembed: encode is_package flag into embedded module data
Browse files Browse the repository at this point in the history
Before, we derived is_package by the presence of a foo.bar module.
Now, the canonical source of is_package is derived at build time and
embedded into the binary. This allows us to properly flag __init__
files as providing packages, thus fixing #53.
  • Loading branch information
indygreg committed Jul 1, 2019
1 parent dec718e commit 7368eb1
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 20 deletions.
21 changes: 21 additions & 0 deletions docs/history.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,27 @@ Blog Posts
Version History
===============

next
----

*Not yet released*

Backwards Compatibility Notes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

* The format of embedded Python module data has changed. The ``pyembed`` crate
and ``pyoxidizer`` versions must match exactly or else the ``pyembed`` crate
will likely crash at run-time when parsing module data.

Bug Fixes
^^^^^^^^^

* The in-memory module importer now correctly populates ``__package__`` in
more cases than it did previously. Before, whether a module was a package
was derived from the presence of a ``foo.bar`` module. Now, a module will be
identified as a package if the file providing it is named ``__init__``. This
more closely matches the behavior of Python's filesystem based importer. (#53)

0.2.0
-----

Expand Down
8 changes: 6 additions & 2 deletions docs/pyembed.rst
Original file line number Diff line number Diff line change
Expand Up @@ -328,10 +328,14 @@ The first 4 bytes are a little endian u32 containing the total number of
modules in this data. Let's call this value ``total``.

Following is an array of length ``total`` with each array element being
a 3-tuples of packed (no interior or exterior padding) composed of 3
a 3-tuple of packed (no interior or exterior padding) composed of 4
little endian u32 values. These values correspond to the module name
length (``name_length``), module source data length (``source_length``),
and module bytecode data length (``bytecode_length``), respectively.
module bytecode data length (``bytecode_length``), and a ``flags`` field
to denote special behavior, respectively.

The least significant bit of the ``flags`` field is set if the
corresponding module name is a package.

Following the lengths array is a vector of the module name strings.
This vector has ``total`` elements. Each element is a non-NULL terminated
Expand Down
49 changes: 31 additions & 18 deletions pyoxidizer/src/pyembed/importer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ impl PythonModuleData {
///
/// This is essentially an index over a raw backing blob.
struct PythonModulesData {
/// Packages in this set of modules.
packages: HashSet<&'static str>,

/// Maps module name to source/bytecode.
data: HashMap<&'static str, PythonModuleData>,
}

Expand All @@ -80,6 +84,7 @@ impl PythonModulesData {
let mut index = Vec::with_capacity(count as usize);
let mut total_names_length = 0;
let mut total_sources_length = 0;
let mut package_count = 0;

for _ in 0..count {
let name_length = reader
Expand All @@ -94,20 +99,30 @@ impl PythonModulesData {
.read_u32::<LittleEndian>()
.or_else(|_| Err("failed reading bytecode length"))?
as usize;
let flags = reader
.read_u32::<LittleEndian>()
.or_else(|_| Err("failed reading module flags"))?;

index.push((name_length, source_length, bytecode_length));
let is_package = flags & 0x01 != 0;

if is_package {
package_count += 1;
}

index.push((name_length, source_length, bytecode_length, is_package));
total_names_length += name_length;
total_sources_length += source_length;
}

let mut res = HashMap::with_capacity(count as usize);
let mut packages = HashSet::with_capacity(package_count);
let sources_start_offset = reader.position() as usize + total_names_length;
let bytecodes_start_offset = sources_start_offset + total_sources_length;

let mut sources_current_offset: usize = 0;
let mut bytecodes_current_offset: usize = 0;

for (name_length, source_length, bytecode_length) in index {
for (name_length, source_length, bytecode_length, is_package) in index {
let offset = reader.position() as usize;

let name =
Expand All @@ -132,10 +147,21 @@ impl PythonModulesData {
sources_current_offset += source_length;
bytecodes_current_offset += bytecode_length;

res.insert(name, PythonModuleData { source, bytecode });
if is_package {
packages.insert(name);
}

// Extension modules will have their names present to populate the
// packages set. So only populate module data if we have data for it.
if source.is_some() || bytecode.is_some() {
res.insert(name, PythonModuleData { source, bytecode });
}
}

Ok(PythonModulesData { data: res })
Ok(PythonModulesData {
packages,
data: res,
})
}
}

Expand Down Expand Up @@ -495,15 +521,6 @@ py_class!(class PyOxidizerResourceReader |py| {
}
});

fn populate_packages(packages: &mut HashSet<&'static str>, name: &'static str) {
let mut search = name;

while let Some(idx) = search.rfind('.') {
packages.insert(&search[0..idx]);
search = &search[0..idx];
}
}

const DOC: &[u8] = b"Binary representation of Python modules\0";

/// Represents global module state to be passed at interpreter initialization time.
Expand Down Expand Up @@ -723,17 +740,13 @@ fn module_setup(
known_modules.insert(name_str, KnownModuleFlavor::Frozen);
}

// TODO consider baking set of packages into embedded data.
let mut packages: HashSet<&'static str> = HashSet::with_capacity(modules_data.data.len());

for (name, record) in modules_data.data {
known_modules.insert(
name,
KnownModuleFlavor::InMemory {
module_data: record,
},
);
populate_packages(&mut packages, name);
}

let resources_data = match PythonResourcesData::from(state.py_resources_data) {
Expand Down Expand Up @@ -779,7 +792,7 @@ fn module_setup(
module_spec_type,
decode_source,
exec_fn,
packages,
modules_data.packages,
known_modules,
resources_data.packages,
resource_readers,
Expand Down
11 changes: 11 additions & 0 deletions pyoxidizer/src/pyrepackager/repackage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,7 @@ impl BuildContext {
/// Represents a single module's data record.
pub struct ModuleEntry {
pub name: String,
pub is_package: bool,
pub source: Option<Vec<u8>>,
pub bytecode: Option<Vec<u8>>,
}
Expand Down Expand Up @@ -279,6 +280,7 @@ impl EmbeddedPythonResources {

records.push(ModuleEntry {
name: name.clone(),
is_package: self.all_packages.contains(name),
source: match source {
Some(value) => Some(value.source.clone()),
None => None,
Expand Down Expand Up @@ -818,6 +820,8 @@ pub fn resolve_python_resources(
}

for (name, extension) in &embedded_built_extension_modules {
all_embedded_modules.insert(name.clone());

if extension.is_package {
annotated_package_names.insert(name.clone());
}
Expand Down Expand Up @@ -936,6 +940,13 @@ pub fn write_modules_entries<W: Write>(
} else {
0
})?;

let mut flags = 0;
if entry.is_package {
flags |= 1;
}

dest.write_u32::<LittleEndian>(flags)?;
}

for entry in entries.iter() {
Expand Down

0 comments on commit 7368eb1

Please sign in to comment.