diff --git a/misc/apply-cache-diff.py b/misc/apply-cache-diff.py
index 29c55247de92..8ede9766bd06 100644
--- a/misc/apply-cache-diff.py
+++ b/misc/apply-cache-diff.py
@@ -8,13 +8,13 @@
 from __future__ import annotations
 
 import argparse
-import json
 import os
 import sys
 
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore
+from mypy.util import json_dumps, json_loads
 
 
 def make_cache(input_dir: str, sqlite: bool) -> MetadataStore:
@@ -26,10 +26,10 @@ def make_cache(input_dir: str, sqlite: bool) -> MetadataStore:
 
 def apply_diff(cache_dir: str, diff_file: str, sqlite: bool = False) -> None:
     cache = make_cache(cache_dir, sqlite)
-    with open(diff_file) as f:
-        diff = json.load(f)
+    with open(diff_file, "rb") as f:
+        diff = json_loads(f.read())
 
-    old_deps = json.loads(cache.read("@deps.meta.json"))
+    old_deps = json_loads(cache.read("@deps.meta.json"))
 
     for file, data in diff.items():
         if data is None:
@@ -37,10 +37,10 @@ def apply_diff(cache_dir: str, diff_file: str, sqlite: bool = False) -> None:
         else:
             cache.write(file, data)
             if file.endswith(".meta.json") and "@deps" not in file:
-                meta = json.loads(data)
+                meta = json_loads(data)
                 old_deps["snapshot"][meta["id"]] = meta["hash"]
 
-    cache.write("@deps.meta.json", json.dumps(old_deps))
+    cache.write("@deps.meta.json", json_dumps(old_deps))
 
     cache.commit()
diff --git a/misc/diff-cache.py b/misc/diff-cache.py
index 15d3e5a83983..8441caf81304 100644
--- a/misc/diff-cache.py
+++ b/misc/diff-cache.py
@@ -8,7 +8,6 @@
 from __future__ import annotations
 
 import argparse
-import json
 import os
 import sys
 from collections import defaultdict
@@ -17,6 +16,7 @@
 sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 
 from mypy.metastore import FilesystemMetadataStore, MetadataStore, SqliteMetadataStore
+from mypy.util import json_dumps, json_loads
 
 
 def make_cache(input_dir: str, sqlite: bool) -> MetadataStore:
@@ -33,7 +33,7 @@ def merge_deps(all: dict[str, set[str]], new: dict[str, set[str]]) -> None:
 
 def load(cache: MetadataStore, s: str) -> Any:
     data = cache.read(s)
-    obj = json.loads(data)
+    obj = json_loads(data)
     if s.endswith(".meta.json"):
         # For meta files, zero out the mtimes and sort the
         # dependencies to avoid spurious conflicts
@@ -73,7 +73,7 @@ def main() -> None:
     type_misses: dict[str, int] = defaultdict(int)
     type_hits: dict[str, int] = defaultdict(int)
 
-    updates: dict[str, str | None] = {}
+    updates: dict[str, bytes | None] = {}
 
     deps1: dict[str, set[str]] = {}
     deps2: dict[str, set[str]] = {}
@@ -96,7 +96,7 @@ def main() -> None:
         # so we can produce a much smaller direct diff of them.
         if ".deps." not in s:
             if obj2 is not None:
-                updates[s] = json.dumps(obj2)
+                updates[s] = json_dumps(obj2)
             else:
                 updates[s] = None
         elif obj2:
@@ -122,7 +122,7 @@ def main() -> None:
     merge_deps(new_deps, root_deps)
 
     new_deps_json = {k: list(v) for k, v in new_deps.items() if v}
-    updates["@root.deps.json"] = json.dumps(new_deps_json)
+    updates["@root.deps.json"] = json_dumps(new_deps_json)
 
     # Drop updates to deps.meta.json for size reasons. The diff
    # applier will manually fix it up.
@@ -136,8 +136,8 @@ def main() -> None:
     print("hits", type_hits)
     print("misses", type_misses)
 
-    with open(args.output, "w") as f:
-        json.dump(updates, f)
+    with open(args.output, "wb") as f:
+        f.write(json_dumps(updates))
 
 
 if __name__ == "__main__":
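The two scripts above have to agree on an encoding: diff-cache.py now writes the updates mapping in binary mode via json_dumps, and apply-cache-diff.py reads it back in binary mode via json_loads. Below is a minimal sketch of that round trip using the stdlib fallback path of the new helpers (orjson, when installed, yields the same logical values); the file name and payload here are made up for illustration.

import json

def json_dumps(obj: object) -> bytes:
    # Fallback path of mypy.util.json_dumps: compact, sorted keys, UTF-8 bytes.
    return json.dumps(obj, sort_keys=True, separators=(",", ":")).encode("utf-8")

def json_loads(data: bytes) -> object:
    return json.loads(data)

# diff-cache.py side: serialize the updates mapping and write it as bytes.
updates = {"pkg/mod.meta.json": '{"id": "pkg.mod", "hash": "abc123"}', "stale.data.json": None}
with open("cache.diff", "wb") as f:
    f.write(json_dumps(updates))

# apply-cache-diff.py side: read the diff back in binary mode.
with open("cache.diff", "rb") as f:
    diff = json_loads(f.read())
assert diff == updates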
diff --git a/mypy/build.py b/mypy/build.py
index 043b52f0a241..fd1c63868a69 100644
--- a/mypy/build.py
+++ b/mypy/build.py
@@ -95,6 +95,7 @@
 from mypy.stubinfo import legacy_bundled_packages, non_bundled_packages, stub_distribution_name
 from mypy.types import Type
 from mypy.typestate import reset_global_state, type_state
+from mypy.util import json_dumps, json_loads
 from mypy.version import __version__
 
 # Switch to True to produce debug output related to fine-grained incremental
@@ -858,7 +859,7 @@ def load_fine_grained_deps(self, id: str) -> dict[str, set[str]]:
         t0 = time.time()
         if id in self.fg_deps_meta:
             # TODO: Assert deps file wasn't changed.
-            deps = json.loads(self.metastore.read(self.fg_deps_meta[id]["path"]))
+            deps = json_loads(self.metastore.read(self.fg_deps_meta[id]["path"]))
         else:
             deps = {}
         val = {k: set(v) for k, v in deps.items()}
@@ -911,8 +912,8 @@ def stats_summary(self) -> Mapping[str, object]:
         return self.stats
 
 
-def deps_to_json(x: dict[str, set[str]]) -> str:
-    return json.dumps({k: list(v) for k, v in x.items()}, separators=(",", ":"))
+def deps_to_json(x: dict[str, set[str]]) -> bytes:
+    return json_dumps({k: list(v) for k, v in x.items()})
 
 
 # File for storing metadata about all the fine-grained dependency caches
@@ -980,7 +981,7 @@ def write_deps_cache(
 
     meta = {"snapshot": meta_snapshot, "deps_meta": fg_deps_meta}
 
-    if not metastore.write(DEPS_META_FILE, json.dumps(meta, separators=(",", ":"))):
+    if not metastore.write(DEPS_META_FILE, json_dumps(meta)):
         manager.log(f"Error writing fine-grained deps meta JSON file {DEPS_META_FILE}")
         error = True
 
@@ -1048,7 +1049,7 @@ def generate_deps_for_cache(manager: BuildManager, graph: Graph) -> dict[str, di
 
 def write_plugins_snapshot(manager: BuildManager) -> None:
     """Write snapshot of versions and hashes of currently active plugins."""
-    snapshot = json.dumps(manager.plugins_snapshot, separators=(",", ":"))
+    snapshot = json_dumps(manager.plugins_snapshot)
     if not manager.metastore.write(PLUGIN_SNAPSHOT_FILE, snapshot):
         manager.errors.set_file(_cache_dir_prefix(manager.options), None, manager.options)
         manager.errors.report(0, 0, "Error writing plugins snapshot", blocker=True)
@@ -1079,8 +1080,8 @@ def read_quickstart_file(
         # just ignore it.
         raw_quickstart: dict[str, Any] = {}
         try:
-            with open(options.quickstart_file) as f:
-                raw_quickstart = json.load(f)
+            with open(options.quickstart_file, "rb") as f:
+                raw_quickstart = json_loads(f.read())
 
             quickstart = {}
             for file, (x, y, z) in raw_quickstart.items():
@@ -1148,10 +1149,10 @@ def _load_json_file(
         manager.add_stats(metastore_read_time=time.time() - t0)
     # Only bother to compute the log message if we are logging it, since it could be big
     if manager.verbosity() >= 2:
-        manager.trace(log_success + data.rstrip())
+        manager.trace(log_success + data.rstrip().decode())
     try:
         t1 = time.time()
-        result = json.loads(data)
+        result = json_loads(data)
         manager.add_stats(data_json_load_time=time.time() - t1)
     except json.JSONDecodeError:
         manager.errors.set_file(file, None, manager.options)
@@ -1343,8 +1344,8 @@ def find_cache_meta(id: str, path: str, manager: BuildManager) -> CacheMeta | No
     # So that plugins can return data with tuples in it without
     # things silently always invalidating modules, we round-trip
     # the config data. This isn't beautiful.
-    plugin_data = json.loads(
-        json.dumps(manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=True)))
+    plugin_data = json_loads(
+        json_dumps(manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=True)))
     )
     if m.plugin_data != plugin_data:
         manager.log(f"Metadata abandoned for {id}: plugin configuration differs")
@@ -1478,10 +1479,7 @@ def validate_meta(
         "ignore_all": meta.ignore_all,
         "plugin_data": meta.plugin_data,
     }
-    if manager.options.debug_cache:
-        meta_str = json.dumps(meta_dict, indent=2, sort_keys=True)
-    else:
-        meta_str = json.dumps(meta_dict, separators=(",", ":"))
+    meta_bytes = json_dumps(meta_dict, manager.options.debug_cache)
     meta_json, _, _ = get_cache_names(id, path, manager.options)
     manager.log(
         "Updating mtime for {}: file {}, meta {}, mtime {}".format(
@@ -1489,7 +1487,7 @@ def validate_meta(
         )
     )
     t1 = time.time()
-    manager.metastore.write(meta_json, meta_str)  # Ignore errors, just an optimization.
+    manager.metastore.write(meta_json, meta_bytes)  # Ignore errors, just an optimization.
     manager.add_stats(validate_update_time=time.time() - t1, validate_munging_time=t1 - t0)
     return meta
 
@@ -1507,13 +1505,6 @@ def compute_hash(text: str) -> str:
     return hash_digest(text.encode("utf-8"))
 
 
-def json_dumps(obj: Any, debug_cache: bool) -> str:
-    if debug_cache:
-        return json.dumps(obj, indent=2, sort_keys=True)
-    else:
-        return json.dumps(obj, sort_keys=True, separators=(",", ":"))
-
-
 def write_cache(
     id: str,
     path: str,
@@ -1566,8 +1557,8 @@ def write_cache(
 
     # Serialize data and analyze interface
     data = tree.serialize()
-    data_str = json_dumps(data, manager.options.debug_cache)
-    interface_hash = compute_hash(data_str)
+    data_bytes = json_dumps(data, manager.options.debug_cache)
+    interface_hash = hash_digest(data_bytes)
 
     plugin_data = manager.plugin.report_config_data(ReportConfigContext(id, path, is_check=False))
 
@@ -1591,7 +1582,7 @@ def write_cache(
         manager.trace(f"Interface for {id} is unchanged")
     else:
         manager.trace(f"Interface for {id} has changed")
-        if not metastore.write(data_json, data_str):
+        if not metastore.write(data_json, data_bytes):
             # Most likely the error is the replace() call
             # (see https://github.com/python/mypy/issues/3215).
             manager.log(f"Error writing data JSON file {data_json}")
@@ -3568,4 +3559,4 @@ def write_undocumented_ref_info(
     assert not ref_info_file.startswith(".")
 
     deps_json = get_undocumented_ref_info_json(state.tree, type_map)
-    metastore.write(ref_info_file, json.dumps(deps_json, separators=(",", ":")))
+    metastore.write(ref_info_file, json_dumps(deps_json))
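A note on the build.py changes: interface_hash is now hash_digest(data_bytes), so incremental builds only stay warm if the same serialized tree always produces byte-identical output; that is why json_dumps keeps key sorting even on the fast orjson path. A small sketch of the invariant (hash_digest here is a stand-in fingerprint, not mypy's exact implementation):

import hashlib
import json

def json_dumps(obj: object) -> bytes:
    # Sorted keys make serialization deterministic across dict insertion orders.
    return json.dumps(obj, sort_keys=True, separators=(",", ":")).encode("utf-8")

def hash_digest(data: bytes) -> str:
    # Stand-in for mypy.util.hash_digest: a content fingerprint, not a security hash.
    return hashlib.sha256(data).hexdigest()

a = {"names": {"f": 1, "g": 2}, "path": "m.py"}
b = {"path": "m.py", "names": {"g": 2, "f": 1}}  # same content, different insertion order

# Equal bytes mean equal interface hashes, so dependents need not be re-checked.
assert json_dumps(a) == json_dumps(b)
assert hash_digest(json_dumps(a)) == hash_digest(json_dumps(b))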
diff --git a/mypy/metastore.py b/mypy/metastore.py
index 4caa7d7f0534..c9139045e3b1 100644
--- a/mypy/metastore.py
+++ b/mypy/metastore.py
@@ -33,14 +33,14 @@ def getmtime(self, name: str) -> float:
         """
 
     @abstractmethod
-    def read(self, name: str) -> str:
+    def read(self, name: str) -> bytes:
         """Read the contents of a metadata entry.
 
         Raises FileNotFound if the entry does not exist.
         """
 
     @abstractmethod
-    def write(self, name: str, data: str, mtime: float | None = None) -> bool:
+    def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
         """Write a metadata entry.
 
         If mtime is specified, set it as the mtime of the entry. Otherwise,
@@ -86,16 +86,16 @@ def getmtime(self, name: str) -> float:
 
         return int(os.path.getmtime(os.path.join(self.cache_dir_prefix, name)))
 
-    def read(self, name: str) -> str:
+    def read(self, name: str) -> bytes:
         assert os.path.normpath(name) != os.path.abspath(name), "Don't use absolute paths!"
 
         if not self.cache_dir_prefix:
             raise FileNotFoundError()
 
-        with open(os.path.join(self.cache_dir_prefix, name)) as f:
+        with open(os.path.join(self.cache_dir_prefix, name), "rb") as f:
             return f.read()
 
-    def write(self, name: str, data: str, mtime: float | None = None) -> bool:
+    def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
         assert os.path.normpath(name) != os.path.abspath(name), "Don't use absolute paths!"
 
         if not self.cache_dir_prefix:
@@ -105,7 +105,7 @@ def write(self, name: str, data: str, mtime: float | None = None) -> bool:
         tmp_filename = path + "." + random_string()
         try:
             os.makedirs(os.path.dirname(path), exist_ok=True)
-            with open(tmp_filename, "w") as f:
+            with open(tmp_filename, "wb") as f:
                 f.write(data)
             os.replace(tmp_filename, path)
             if mtime is not None:
@@ -135,15 +135,13 @@ def list_all(self) -> Iterable[str]:
 
 
 SCHEMA = """
-CREATE TABLE IF NOT EXISTS files (
+CREATE TABLE IF NOT EXISTS files2 (
     path TEXT UNIQUE NOT NULL,
     mtime REAL,
-    data TEXT
+    data BLOB
 );
-CREATE INDEX IF NOT EXISTS path_idx on files(path);
+CREATE INDEX IF NOT EXISTS path_idx on files2(path);
 """
-# No migrations yet
-MIGRATIONS: list[str] = []
 
 
 def connect_db(db_file: str) -> sqlite3.Connection:
@@ -151,11 +149,6 @@ def connect_db(db_file: str) -> sqlite3.Connection:
 
     db = sqlite3.dbapi2.connect(db_file)
     db.executescript(SCHEMA)
-    for migr in MIGRATIONS:
-        try:
-            db.executescript(migr)
-        except sqlite3.OperationalError:
-            pass
     return db
 
@@ -176,7 +169,7 @@ def _query(self, name: str, field: str) -> Any:
         if not self.db:
             raise FileNotFoundError()
 
-        cur = self.db.execute(f"SELECT {field} FROM files WHERE path = ?", (name,))
+        cur = self.db.execute(f"SELECT {field} FROM files2 WHERE path = ?", (name,))
         results = cur.fetchall()
         if not results:
             raise FileNotFoundError()
@@ -188,12 +181,12 @@ def getmtime(self, name: str) -> float:
         assert isinstance(mtime, float)
         return mtime
 
-    def read(self, name: str) -> str:
+    def read(self, name: str) -> bytes:
         data = self._query(name, "data")
-        assert isinstance(data, str)
+        assert isinstance(data, bytes)
         return data
 
-    def write(self, name: str, data: str, mtime: float | None = None) -> bool:
+    def write(self, name: str, data: bytes, mtime: float | None = None) -> bool:
         import sqlite3
 
         if not self.db:
@@ -202,7 +195,7 @@ def write(self, name: str, data: str, mtime: float | None = None) -> bool:
             if mtime is None:
                 mtime = time.time()
             self.db.execute(
-                "INSERT OR REPLACE INTO files(path, mtime, data) VALUES(?, ?, ?)",
+                "INSERT OR REPLACE INTO files2(path, mtime, data) VALUES(?, ?, ?)",
                 (name, mtime, data),
             )
         except sqlite3.OperationalError:
@@ -213,7 +206,7 @@ def remove(self, name: str) -> None:
         if not self.db:
             raise FileNotFoundError()
 
-        self.db.execute("DELETE FROM files WHERE path = ?", (name,))
+        self.db.execute("DELETE FROM files2 WHERE path = ?", (name,))
 
     def commit(self) -> None:
         if self.db:
@@ -221,5 +214,5 @@ def list_all(self) -> Iterable[str]:
         if self.db:
-            for row in self.db.execute("SELECT path FROM files"):
+            for row in self.db.execute("SELECT path FROM files2"):
                 yield row[0]
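The files → files2 rename (with data switched from TEXT to BLOB) appears to double as the migration strategy: CREATE TABLE IF NOT EXISTS leaves any existing str-typed files table in place but untouched, and all reads and writes move to the new table, so stale caches are effectively discarded rather than migrated — presumably why the empty MIGRATIONS machinery could be dropped. A sketch of that behavior against an in-memory database:

import sqlite3

SCHEMA = """
CREATE TABLE IF NOT EXISTS files2 (
    path TEXT UNIQUE NOT NULL,
    mtime REAL,
    data BLOB
);
CREATE INDEX IF NOT EXISTS path_idx on files2(path);
"""

db = sqlite3.connect(":memory:")
# Simulate a cache DB created by an older mypy: the old str-based table exists.
db.execute("CREATE TABLE files (path TEXT UNIQUE NOT NULL, mtime REAL, data TEXT)")
db.execute("INSERT INTO files VALUES ('m.meta.json', 0.0, '{}')")

# Opening with the new schema just adds files2 alongside; old rows are never read again.
db.executescript(SCHEMA)
db.execute(
    "INSERT OR REPLACE INTO files2(path, mtime, data) VALUES(?, ?, ?)",
    ("m.meta.json", 0.0, b"{}"),
)
row = db.execute("SELECT data FROM files2 WHERE path = 'm.meta.json'").fetchone()
assert isinstance(row[0], bytes)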
diff --git a/mypy/util.py b/mypy/util.py
index 2bb6b85ae0b0..ae5c7ffb20cb 100644
--- a/mypy/util.py
+++ b/mypy/util.py
@@ -4,6 +4,7 @@
 
 import hashlib
 import io
+import json
 import os
 import pathlib
 import re
@@ -11,9 +12,15 @@
 import sys
 import time
 from importlib import resources as importlib_resources
-from typing import IO, Callable, Container, Final, Iterable, Sequence, Sized, TypeVar
+from typing import IO, Any, Callable, Container, Final, Iterable, Sequence, Sized, TypeVar
 from typing_extensions import Literal
 
+orjson: Any
+try:
+    import orjson  # type: ignore[import-not-found, no-redef, unused-ignore]
+except ImportError:
+    orjson = None
+
 try:
     import curses
@@ -889,3 +896,25 @@ def quote_docstring(docstr: str) -> str:
         return f"''{docstr_repr}''"
     else:
         return f'""{docstr_repr}""'
+
+
+def json_dumps(obj: object, debug: bool = False) -> bytes:
+    if orjson is not None:
+        if debug:
+            return orjson.dumps(obj, option=orjson.OPT_INDENT_2 | orjson.OPT_SORT_KEYS)  # type: ignore[no-any-return]
+        else:
+            # TODO: If we don't sort keys here, testIncrementalInternalScramble fails
+            # We should document exactly what is going on there
+            return orjson.dumps(obj, option=orjson.OPT_SORT_KEYS)  # type: ignore[no-any-return]
+
+    if debug:
+        return json.dumps(obj, indent=2, sort_keys=True).encode("utf-8")
+    else:
+        # See above for sort_keys comment
+        return json.dumps(obj, sort_keys=True, separators=(",", ":")).encode("utf-8")
+
+
+def json_loads(data: bytes) -> Any:
+    if orjson is not None:
+        return orjson.loads(data)
+    return json.loads(data)
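Usage of the new helpers, assuming the patched mypy.util is importable: output is always bytes, compact with sorted keys by default, and indented when debug=True (backed by orjson's OPT_INDENT_2/OPT_SORT_KEYS when orjson is available, stdlib json otherwise). Since orjson.JSONDecodeError is a subclass of json.JSONDecodeError, call sites like _load_json_file in mypy/build.py can keep catching json.JSONDecodeError regardless of which backend decoded the data.

from mypy.util import json_dumps, json_loads

data = json_dumps({"b": 1, "a": [2, 3]})
assert data == b'{"a":[2,3],"b":1}'  # compact separators, keys sorted
assert json_loads(data) == {"b": 1, "a": [2, 3]}

pretty = json_dumps({"a": 1}, debug=True)
assert pretty.startswith(b"{\n")  # indented form, used for --debug-cache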
diff --git a/mypyc/codegen/emitmodule.py b/mypyc/codegen/emitmodule.py
index 1d8708912de5..b5e0a37f0cca 100644
--- a/mypyc/codegen/emitmodule.py
+++ b/mypyc/codegen/emitmodule.py
@@ -24,7 +24,7 @@
 from mypy.nodes import MypyFile
 from mypy.options import Options
 from mypy.plugin import Plugin, ReportConfigContext
-from mypy.util import hash_digest
+from mypy.util import hash_digest, json_dumps
 from mypyc.codegen.cstring import c_string_initializer
 from mypyc.codegen.emit import Emitter, EmitterContext, HeaderDeclaration, c_array_initializer
 from mypyc.codegen.emitclass import generate_class, generate_class_type_decl
@@ -154,7 +154,7 @@ def report_config_data(self, ctx: ReportConfigContext) -> tuple[str | None, list
         ir_data = json.loads(ir_json)
 
         # Check that the IR cache matches the metadata cache
-        if compute_hash(meta_json) != ir_data["meta_hash"]:
+        if hash_digest(meta_json) != ir_data["meta_hash"]:
             return None
 
         # Check that all of the source files are present and as
@@ -369,11 +369,11 @@ def write_cache(
         newpath = get_state_ir_cache_name(st)
         ir_data = {
             "ir": module.serialize(),
-            "meta_hash": compute_hash(meta_data),
+            "meta_hash": hash_digest(meta_data),
             "src_hashes": hashes[group_map[id]],
         }
 
-        result.manager.metastore.write(newpath, json.dumps(ir_data, separators=(",", ":")))
+        result.manager.metastore.write(newpath, json_dumps(ir_data))
 
     result.manager.metastore.commit()
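For the mypyc change: the IR cache entry embeds a hash of the corresponding metadata cache entry, written and verified with the same hash_digest now that metastore contents are bytes. A compressed sketch of the two sides of that check (hash_digest here is a stand-in fingerprint; the real one lives in mypy.util, and the payloads are made up):

import hashlib

def hash_digest(data: bytes) -> str:
    # Stand-in for mypy.util.hash_digest.
    return hashlib.sha256(data).hexdigest()

# Write side (write_cache): stamp the IR entry with a hash of the meta entry.
meta_data = b'{"id":"pkg.mod","hash":"abc123"}'  # bytes, as returned by metastore.read()
ir_data = {"ir": {"...": "..."}, "meta_hash": hash_digest(meta_data), "src_hashes": {}}

# Read side (report_config_data): re-read the meta entry and compare hashes.
meta_json = meta_data
assert hash_digest(meta_json) == ir_data["meta_hash"], "IR cache does not match metadata cache"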