From 306600998424e8e82bd0ccbf50bcee5ad64605e0 Mon Sep 17 00:00:00 2001 From: Evan Sultanik Date: Tue, 10 May 2022 11:05:09 -0400 Subject: [PATCH 01/12] Makes the default behavior to auto-match dict entries if they have the same key --- graphtage/multiset.py | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/graphtage/multiset.py b/graphtage/multiset.py index e855fd1..5c45fdf 100644 --- a/graphtage/multiset.py +++ b/graphtage/multiset.py @@ -7,6 +7,7 @@ from typing import Iterator, List +import graphtage from .bounds import Range from .edits import Insert, Match, Remove from .matching import WeightedBipartiteMatcher @@ -27,7 +28,8 @@ def __init__( from_node: SequenceNode, to_node: SequenceNode, from_set: HashableCounter[TreeNode], - to_set: HashableCounter[TreeNode] + to_set: HashableCounter[TreeNode], + auto_match_keys: bool = True ): """Initializes the edit. @@ -38,8 +40,33 @@ def __init__( this is neither checked nor enforced. to_set: The set of nodes to which to match. These should typically be children of :obj:`to_node`, but this is neither checked nor enforced. + auto_match_keys: If `True`, any :class:`graphtage.KeyValuePairNode`s in :obj:`from_set` that have keys + equal to :class:`graphtage.KeyValuePairNode`s in :obj:`to_set` will automatically be matched. Setting + this to `False` will require a significant amount more computation for larger dictionaries. """ + self._matched_kvp_edits: List[Edit] = [] + if auto_match_keys: + to_set = HashableCounter(to_set) + from_set = HashableCounter(from_set) + to_remove_from = [] + for f in from_set.keys(): + if not isinstance(f, graphtage.KeyValuePairNode): + continue + for t in to_set.keys(): + if not isinstance(f, graphtage.KeyValuePairNode): + continue + if f.key == t.key: + num_matched = min(from_set[f], to_set[t]) + for _ in range(num_matched): + self._matched_kvp_edits.append(f.edits(t)) + to_remove_from.append((f, num_matched)) + break + else: + continue + to_set[t] -= num_matched + for f, num_matched in to_remove_from: + from_set[f] -= num_matched self.to_insert = to_set - from_set """The set of nodes in :obj:`to_set` that do not exist in :obj:`from_set`.""" self.to_remove = from_set - to_set @@ -61,6 +88,7 @@ def is_complete(self) -> bool: def edits(self) -> Iterator[Edit]: yield from self._edits + yield from self._matched_kvp_edits remove_matched: HashableCounter[TreeNode] = HashableCounter() insert_matched: HashableCounter[TreeNode] = HashableCounter() for (rem, (ins, edit)) in self._matcher.matching.items(): @@ -74,10 +102,15 @@ def edits(self) -> Iterator[Edit]: def tighten_bounds(self) -> bool: """Delegates to :meth:`WeightedBipartiteMatcher.tighten_bounds`.""" + for kvp_edit in self._matched_kvp_edits: + if kvp_edit.tighten_bounds(): + return True return self._matcher.tighten_bounds() def bounds(self) -> Range: b = self._matcher.bounds() + for kvp_edit in self._matched_kvp_edits: + b = b + kvp_edit.bounds() if len(self.to_remove) > len(self.to_insert): for edit in largest( *(Remove(to_remove=r, remove_from=self.from_node) for r in self.to_remove), From a8d5e88bac08dcdc89216065d76d6b4a4078659c Mon Sep 17 00:00:00 2001 From: Evan Sultanik Date: Tue, 10 May 2022 11:26:45 -0400 Subject: [PATCH 02/12] Started adding command line options for the dict key optimization --- graphtage/__main__.py | 31 +++++++++++++++++++++++++++---- graphtage/graphtage.py | 8 ++++++-- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/graphtage/__main__.py b/graphtage/__main__.py index f8964c7..e4270c4 100644 --- a/graphtage/__main__.py +++ b/graphtage/__main__.py @@ -135,14 +135,23 @@ def main(argv=None) -> int: formatting.add_argument('--join-lists', '-jl', action='store_true', help='do not print a newline after each list entry') formatting.add_argument('--join-dict-items', '-jd', action='store_true', - help='do not print a newline after each key/value pair in a dictionary') + help='do not print a newline after each key/value pair in a dictionary') formatting.add_argument('--condensed', '-j', action='store_true', help='equivalent to `-jl -jd`') formatting.add_argument('--html', action='store_true', help='output the diff in HTML') - parser.add_argument( + key_match_strategy = parser.add_mutually_exclusive_group() + key_match_strategy.add_argument("--dict-strategy", choices=("auto", "match", "none"), + help="sets the strategy for matching dictionary key/value pairs: `auto` (the " + "default) will automatically match two key/value pairs if they share the " + "same key, but consider key edits for all non-identical keys; `match` will " + "attempt to consider all possible key edits (the most computationally " + "expensive); and `none` will not consider any edits on dictionary keys (the " + "least computationally expensive)") + key_match_strategy.add_argument( '--no-key-edits', '-k', action='store_true', - help='only match dictionary entries if they share the same key. This drastically reduces computation.' + help='only match dictionary entries if they share the same key, drastically reducing computation; this is ' + 'equivalent to `--dict-strategy none`' ) list_edit_group = parser.add_mutually_exclusive_group() list_edit_group.add_argument( @@ -273,8 +282,22 @@ def printer_type(*pos_args, **kwargs): else: match_unless = None + if args.dict_strategy == "none": + allow_key_edits = False + auto_match_keys = False + elif args.dict_strategy == "auto": + allow_key_edits = True + auto_match_keys = True + elif args.dict_strategy == "match": + allow_key_edits = True + auto_match_keys = False + else: + allow_key_edits = not args.no_key_edits + auto_match_keys = allow_key_edits + options = graphtage.BuildOptions( - allow_key_edits=not args.no_key_edits, + allow_key_edits=allow_key_edits, + auto_match_keys=auto_match_keys, allow_list_edits=not args.no_list_edits, allow_list_edits_when_same_length=not args.no_list_edits_when_same_length ) diff --git a/graphtage/graphtage.py b/graphtage/graphtage.py index 4323d35..c9eedf7 100644 --- a/graphtage/graphtage.py +++ b/graphtage/graphtage.py @@ -338,10 +338,11 @@ def edits(self, node: TreeNode) -> Edit: class MultiSetNode(SequenceNode[HashableCounter[T]], Generic[T]): """A node representing a set that can contain duplicate items.""" - def __init__(self, items: Iterable[T]): + def __init__(self, items: Iterable[T], auto_match_keys: bool = True): if not isinstance(items, HashableCounter): items = HashableCounter(items) super().__init__(items) + self.auto_match_keys: bool = auto_match_keys def to_obj(self): return HashableCounter(n.to_obj() for n in self) @@ -357,7 +358,7 @@ def edits(self, node: TreeNode) -> Edit: elif self._children == node._children: return Match(self, node, 0) else: - return MultiSetEdit(self, node, self._children, node._children) + return MultiSetEdit(self, node, self._children, node._children, auto_match_keys=self.auto_match_keys) else: return Replace(self, node) @@ -910,6 +911,7 @@ class BuildOptions: def __init__(self, *, allow_key_edits=True, + auto_match_keys=True, allow_list_edits=True, allow_list_edits_when_same_length=True, **kwargs @@ -925,6 +927,8 @@ def __init__(self, *, """Whether to consider insert and remove edits to lists""" self.allow_list_edits_when_same_length = allow_list_edits_when_same_length """Whether to consider insert and remove edits on lists that are the same length""" + self.auto_match_keys = auto_match_keys + """Whether to automatically match key/value pairs in dictionaries if they share the same key""" for attr, value in kwargs.items(): setattr(self, attr, value) From 42b0f62d1f435706a65ee4ffa58f10a6eee6c52e Mon Sep 17 00:00:00 2001 From: Evan Sultanik Date: Tue, 10 May 2022 13:17:59 -0400 Subject: [PATCH 03/12] Plumbing for honoring `--dict-strategy` --- graphtage/json.py | 9 ++++++--- graphtage/xml.py | 7 +++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/graphtage/json.py b/graphtage/json.py index 5737619..fd987ad 100644 --- a/graphtage/json.py +++ b/graphtage/json.py @@ -62,8 +62,10 @@ def build_tree( build_tree(k, options=options, force_leaf_node=True): build_tree(v, options=options) for k, v in python_obj.items() } - if options is None or options.allow_key_edits: - return DictNode.from_dict(dict_items) + if options.allow_key_edits: + dict_node = DictNode.from_dict(dict_items) + dict_node.auto_match_keys = options.auto_match_keys + return dict_node else: return FixedKeyDictNode.from_dict(dict_items) elif python_obj is None: @@ -255,7 +257,8 @@ def build_tree_handling_errors(self, path: str, options: Optional[BuildOptions] try: return self.build_tree(path=path, options=options) except json.decoder.JSONDecodeError as de: - return f'Error parsing {os.path.basename(path)}: {de.msg}: line {de.lineno}, column {de.colno} (char {de.pos})' + return f'Error parsing {os.path.basename(path)}: {de.msg}: line {de.lineno}, column {de.colno} ' \ + f'(char {de.pos})' def get_default_formatter(self) -> JSONFormatter: return JSONFormatter.DEFAULT_INSTANCE diff --git a/graphtage/xml.py b/graphtage/xml.py index eec0c8a..359b194 100644 --- a/graphtage/xml.py +++ b/graphtage/xml.py @@ -139,7 +139,8 @@ def __init__( attrib: Optional[Dict[StringNode, StringNode]] = None, text: Optional[StringNode] = None, children: Sequence['XMLElement'] = (), - allow_key_edits: bool = True + allow_key_edits: bool = True, + auto_match_keys: bool = True ): """Initializes an XML element. @@ -157,6 +158,7 @@ def __init__( attrib = {} if allow_key_edits: self.attrib: DictNode = DictNode.from_dict(attrib) + self.attrib.auto_match_keys = auto_match_keys """The attributes of this element.""" else: self.attrib = FixedKeyDictNode.from_dict(attrib) @@ -263,7 +265,8 @@ def build_tree( }, text=text, children=[build_tree(child, options) for child in root], - allow_key_edits=options is None or options.allow_key_edits + allow_key_edits=options is None or options.allow_key_edits, + auto_match_keys=options is None or options.auto_match_keys ) From a09216a28169e28b325192a9c2529b4a14137d24 Mon Sep 17 00:00:00 2001 From: Evan Sultanik Date: Tue, 10 May 2022 13:18:47 -0400 Subject: [PATCH 04/12] Adds an abbreviation for `--dict-strategy` --- graphtage/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphtage/__main__.py b/graphtage/__main__.py index e4270c4..f02f735 100644 --- a/graphtage/__main__.py +++ b/graphtage/__main__.py @@ -139,7 +139,7 @@ def main(argv=None) -> int: formatting.add_argument('--condensed', '-j', action='store_true', help='equivalent to `-jl -jd`') formatting.add_argument('--html', action='store_true', help='output the diff in HTML') key_match_strategy = parser.add_mutually_exclusive_group() - key_match_strategy.add_argument("--dict-strategy", choices=("auto", "match", "none"), + key_match_strategy.add_argument("--dict-strategy", "-ds", choices=("auto", "match", "none"), help="sets the strategy for matching dictionary key/value pairs: `auto` (the " "default) will automatically match two key/value pairs if they share the " "same key, but consider key edits for all non-identical keys; `match` will " From 0b8ae2554cca0825b3af2780287f6c1dce0680e6 Mon Sep 17 00:00:00 2001 From: Evan Sultanik Date: Fri, 13 May 2022 08:38:41 -0400 Subject: [PATCH 05/12] Up the version number in preparation for a release --- graphtage/version.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/graphtage/version.py b/graphtage/version.py index 2f1330e..7b565fc 100644 --- a/graphtage/version.py +++ b/graphtage/version.py @@ -48,7 +48,7 @@ def git_branch() -> Optional[str]: return None -DEV_BUILD = True +DEV_BUILD = False """Sets whether this build is a development build. This should only be set to :const:`False` to coincide with a release. It should *always* be :const:`False` before @@ -59,7 +59,7 @@ def git_branch() -> Optional[str]: """ -__version__: Tuple[Union[int, str], ...] = (0, 2, 6) +__version__: Tuple[Union[int, str], ...] = (0, 2, 7) if DEV_BUILD: branch_name = git_branch() From 1e1ff719ec4ffeba99de37dd76c85cb6cfccf250 Mon Sep 17 00:00:00 2001 From: Evan Sultanik Date: Fri, 13 May 2022 08:48:16 -0400 Subject: [PATCH 06/12] Replace the screenshot with syntax highlighted markdown --- README.md | 37 ++++++++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 763df42..87fe5db 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,40 @@ for semantically comparing and merging tree-like structures, such as JSON, XML, portmanteau of “graph” and “graftage”—the latter being the horticultural practice of joining two trees together such that they grow as one. -

- -

+```console +$ echo Original: && cat original.json && echo Modified: && cat modified.json +``` +```json +Original: +{ + "foo": [1, 2, 3, 4], + "bar": "testing" +} +Modified: +{ + "foo": [2, 3, 4, 5], + "zab": "testing", + "woo": ["foobar"] +} +``` +```console +$ graphtage original.json modified.json +``` +```json +{ + "z̟b̶ab̟r̶": "testing", + "foo": [ + 1̶,̶ + 2, + 3, + 4,̟ + 5̟ + ],̟ + "̟w̟o̟o̟"̟:̟ ̟[̟ + "̟f̟o̟o̟b̟a̟r̟"̟ + ]̟ +} +``` ## Installation From 282ccea46ed2d104594bb904507016ebd7eeb559 Mon Sep 17 00:00:00 2001 From: Evan Sultanik Date: Fri, 13 May 2022 08:49:53 -0400 Subject: [PATCH 07/12] Bumped the version number too high! --- graphtage/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphtage/version.py b/graphtage/version.py index 7b565fc..0003bb8 100644 --- a/graphtage/version.py +++ b/graphtage/version.py @@ -59,7 +59,7 @@ def git_branch() -> Optional[str]: """ -__version__: Tuple[Union[int, str], ...] = (0, 2, 7) +__version__: Tuple[Union[int, str], ...] = (0, 2, 6) if DEV_BUILD: branch_name = git_branch() From 5ff64493c3f0dfa301022ddc28dec81bc5825892 Mon Sep 17 00:00:00 2001 From: Evan Sultanik Date: Fri, 13 May 2022 08:52:48 -0400 Subject: [PATCH 08/12] Additional details about output formatting. --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 87fe5db..a34f654 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,8 @@ Graphtage performs an analysis on an intermediate representation of the trees th input files. This means, for example, that you can diff a JSON file against a YAML file. Also, the output format can be different from the input format(s). By default, Graphtage will format the output diff in the same file format as the first input file. But one could, for example, diff two JSON files and format the output in YAML. There are several -command-line arguments to specify these transformations; please check the `--help` output for more information. +command-line arguments to specify these transformations, such as `--format`; please check the `--help` output for more +information. By default, Graphtage pretty-prints its output with as many line breaks and indents as possible. ```json From df7fda5d1f1846512b7b6b45e7601de02c24a0d6 Mon Sep 17 00:00:00 2001 From: Evan Sultanik Date: Fri, 13 May 2022 11:04:34 -0400 Subject: [PATCH 09/12] Document the new matching options --- README.md | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index a34f654..1b0a9a5 100644 --- a/README.md +++ b/README.md @@ -94,12 +94,23 @@ Use `--condensed` or `-j` to apply both of these options: The `--only-edits` or `-e` option will print out a list of edits rather than applying them to the input file in place. ### Matching Options -By default, Graphtage tries to match all possible pairs of elements in a dictionary. While computationally tractable, -this can sometimes be onerous for input files with huge dictionaries. The `--no-key-edits` or `-k` option will instead -only attempt to match dictionary items that share the same key, drastically reducing computation. Likewise, the -`--no-list-edits` or `-l` option will not consider interstitial insertions and removals when comparing two lists. The -`--no-list-edits-when-same-length` or `-ll` option is a less drastic version of `-l` that will behave normally for lists -that are of different lengths but behave like `-l` for lists that are of the same length. +By default, Graphtage tries to match all possible pairs of elements in a dictionary. + +Matching two dictionaries with each other is hard. Although computationally tractable, this can sometimes be onerous for +input files with huge dictionaries. Graphtage has three different strategies for matching dictionaries: +1. `--dict-strategy match` (the most computationally expensive) tries to match all pairs of keys and values between the + two dictionaries, resulting in a match of minimum edit distance; +2. `--dict-strategy none` (the least computationally expensive) will not attempt to match any key/value pairs unless + they have the exact same key; and +3. `--dict-strategy auto` (the default) will automatically match the values of any key-value pairs that have identical + keys and then use the `match` strategy for the remainder of key/value pairs. + +See [Pull Request #51](https://github.com/trailofbits/graphtage/pull/51) for some examples of how these strategies +affect output. + +The `--no-list-edits` or `-l` option will not consider interstitial insertions and removals when comparing two lists. +The `--no-list-edits-when-same-length` or `-ll` option is a less drastic version of `-l` that will behave normally for +lists that are of different lengths but behave like `-l` for lists that are of the same length. ### ANSI Color By default, Graphtage will only use ANSI color in its output if it is run from a TTY. If, for example, you would like From 7d67a426e7d5e17a82f9198ffca893d905e37a1d Mon Sep 17 00:00:00 2001 From: Evan Sultanik Date: Fri, 13 May 2022 11:05:14 -0400 Subject: [PATCH 10/12] Adds the new version to the documentation template --- docs/_templates/layout.html | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/_templates/layout.html b/docs/_templates/layout.html index d23a622..4f6716c 100644 --- a/docs/_templates/layout.html +++ b/docs/_templates/layout.html @@ -17,6 +17,7 @@ {% endfor %} {% else %}
latest
+
0.2.6
0.2.5
0.2.4
0.2.3
From 4a644dc3093e7a58fd32b983293da39047742617 Mon Sep 17 00:00:00 2001 From: Evan Sultanik Date: Fri, 13 May 2022 11:39:36 -0400 Subject: [PATCH 11/12] Test against Python 3.10 in CI --- .github/workflows/pythonpackage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pythonpackage.yml b/.github/workflows/pythonpackage.yml index d9c659f..f02e9df 100644 --- a/.github/workflows/pythonpackage.yml +++ b/.github/workflows/pythonpackage.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.6, 3.7, 3.8, 3.9] + python-version: [3.6, 3.7, 3.8, 3.9, "3.10"] steps: - uses: actions/checkout@v2 From 8ec008fb8e44613f7af3dd002bf9da04c2621a0e Mon Sep 17 00:00:00 2001 From: Evan Sultanik Date: Mon, 16 May 2022 12:31:50 -0400 Subject: [PATCH 12/12] Keep this version a dev build --- graphtage/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/graphtage/version.py b/graphtage/version.py index 0003bb8..2f1330e 100644 --- a/graphtage/version.py +++ b/graphtage/version.py @@ -48,7 +48,7 @@ def git_branch() -> Optional[str]: return None -DEV_BUILD = False +DEV_BUILD = True """Sets whether this build is a development build. This should only be set to :const:`False` to coincide with a release. It should *always* be :const:`False` before