fix: faster stream verification #76

H4ad · 2023-04-04T15:14:32Z

Faster integrity check when the input is a stream

I also took a look at stream mode, because PNPM also verifies the integrity of files using streams.

The initial version was already fast compared to main:

ssri.fromStream(stream, largeIntegrity) x 145 ops/sec ±1.58% (78 runs sampled)
ssri.fromStream(stream, tinyIntegrity) x 9,508 ops/sec ±2.53% (76 runs sampled)
ssri.checkStream(stream, largeIntegrity) x 153 ops/sec ±0.92% (78 runs sampled)
ssri.checkStream(stream, tinyIntegrity) x 9,055 ops/sec ±1.81% (80 runs sampled)

I also saw that checkStream doesn't support the single option, and almost all verifications done by PNPM only verify a single hash, so I see an opportunity to push the performance a little bit further.

ssri.fromStream(stream, largeIntegrity) x 147 ops/sec ±1.76% (76 runs sampled)
ssri.fromStream(stream, tinyIntegrity) x 10,339 ops/sec ±2.71% (80 runs sampled)
ssri.checkStream(stream, largeIntegrity) x 152 ops/sec ±1.14% (80 runs sampled)
ssri.checkStream(stream, tinyIntegrity) x 10,023 ops/sec ±1.46% (81 runs sampled)

ssri.checkStream(stream, largeIntegrity, { single: true }) x 151 ops/sec ±1.19% (79 runs sampled)
ssri.checkStream(stream, tinyIntegrity, { single: true }) x 10,278 ops/sec ±1.35% (81 runs sampled)

But I did an experiment: if we ignore all the checkStream code and jump straight to the final verification, we can achieve this performance:

ssri + createHash (largeIntegrity) x 318 ops/sec ±1.52% (81 runs sampled)
ssri + createHash (tinyIntegrity) x 15,863 ops/sec ±1.70% (81 runs sampled)

I put the code in the file below. The assumption is: if we verify only one hash, we can skip a lot of verifications.
So I think it could be good for ssri to export single-hash verification. What do you think?

benchmark-stream.js

const Benchmark = require('benchmark');
// const wtf = require("wtfnode");
// wtf.init();
const ssri = require('./lib/index');
const suite = new Benchmark.Suite();
const fs = require('fs');
const crypto = require('crypto');
const { Readable } = require('stream');

// Benchmark payloads: a 64-character string and the same string repeated
// 100 times (6400 characters). Each payload is also kept as an array of
// single characters so the streams below emit one chunk per character.
const tinyText = 'a'.repeat(64);
const largeText = tinyText.repeat(100);

const tinyTextSplitted = Array.from(tinyText);
const largeTextSplitted = Array.from(largeText);

/**
 * Creates a fresh readable stream over `text`.
 *
 * @param {Iterable|AsyncIterable} text - Source handed to `Readable.from`.
 * @returns {Readable} A new stream; every benchmark sample needs its own.
 */
function getStream(text) {
  return Readable.from(text);
}

/**
 * Computes the base64-encoded digest of `data`.
 *
 * @param {*} data - Value passed to `Hash#update`.
 * @param {string} algorithm - Algorithm name passed to `crypto.createHash`.
 * @returns {string} The digest encoded as base64.
 */
function hash(data, algorithm) {
  const hasher = crypto.createHash(algorithm);
  hasher.update(data);
  return hasher.digest('base64');
}

// Expected SRI strings ("sha512-<base64 digest>") for the two payloads.
const [largeIntegrity, tinyIntegrity] = [largeText, tinyText].map(
  (text) => `sha512-${hash(text, 'sha512')}`
);

/**
 * Builds a deferred benchmark.js test around a promise-returning ssri call.
 * The sample is marked complete once the promise fulfils.
 *
 * The classic `function (deferred)` form is kept on purpose: benchmark.js
 * may inspect the test function's source text, so the original spelling of
 * the signature is preserved.
 *
 * @param {() => Promise<*>} run - Starts one sample and returns its promise.
 * @returns {{defer: boolean, fn: Function}} Options object for `suite.add`.
 */
function ssriCase(run) {
  return {
    defer: true,
    fn: function (deferred) {
      run().then(() => {
        deferred.resolve();
      });
    },
  };
}

/**
 * Builds a deferred benchmark.js test that verifies a single hash "by hand":
 * the integrity string is parsed with `{ single: true }`, the stream is piped
 * into a `crypto` hash, and the resulting digest is compared on `end`.
 *
 * @param {string[]} chars - Chunks to stream, one per array element.
 * @param {string} integrity - SRI string the stream is expected to match.
 * @returns {{defer: boolean, fn: Function}} Options object for `suite.add`.
 */
function createHashCase(chars, integrity) {
  return {
    defer: true,
    fn: function (deferred) {
      const stream = getStream(chars);
      const parsed = ssri.parse(integrity, { single: true });
      const hasher = crypto.createHash(parsed.algorithm);

      stream.pipe(hasher);
      stream.on('end', () => {
        const digest = hasher.digest('base64');

        if (parsed.digest !== digest) {
          throw new Error('Integrity check failed');
        }
        deferred.resolve();
      });
    },
  };
}

// Benchmark names and order are unchanged so results stay comparable.
//
// `ssri.fromStream` takes `(stream, opts)`; the integrity to verify has to be
// supplied as `opts.integrity`. Passing the SRI string itself in the `opts`
// position carries no `integrity` property, so nothing would be verified.
suite
  .add(
    'ssri.fromStream(stream, largeIntegrity)',
    ssriCase(() =>
      ssri.fromStream(getStream(largeTextSplitted), { integrity: largeIntegrity })
    )
  )
  .add(
    'ssri.fromStream(stream, tinyIntegrity)',
    ssriCase(() =>
      ssri.fromStream(getStream(tinyTextSplitted), { integrity: tinyIntegrity })
    )
  )
  .add(
    'ssri.checkStream(stream, largeIntegrity)',
    ssriCase(() => ssri.checkStream(getStream(largeTextSplitted), largeIntegrity))
  )
  .add(
    'ssri.checkStream(stream, tinyIntegrity)',
    ssriCase(() => ssri.checkStream(getStream(tinyTextSplitted), tinyIntegrity))
  )
  .add(
    'ssri.checkStream(stream, largeIntegrity, { single: true })',
    // A fresh options object is created for every sample, as in the original.
    ssriCase(() =>
      ssri.checkStream(getStream(largeTextSplitted), largeIntegrity, { single: true })
    )
  )
  .add(
    'ssri.checkStream(stream, tinyIntegrity, { single: true })',
    ssriCase(() =>
      ssri.checkStream(getStream(tinyTextSplitted), tinyIntegrity, { single: true })
    )
  )
  .add(
    'ssri + createHash (largeIntegrity)',
    createHashCase(largeTextSplitted, largeIntegrity)
  )
  .add(
    'ssri + createHash (tinyIntegrity)',
    createHashCase(tinyTextSplitted, tinyIntegrity)
  )
  .on('cycle', function (event) {
    // Print the ops/sec summary line for the benchmark that just finished.
    console.log(String(event.target));
    // wtf.dump();
  })
  .run({ async: false });

References

Related to #71

lib/index.js

wraithgar · 2023-04-04T16:20:31Z

lib/index.js

-    this.algorithm = this.goodSri ? this.sri.pickAlgorithm(this.opts) : null
+    this.goodSri = this.sri instanceof Integrity
+      ? !!Object.keys(this.sri).length
+      : this.sri instanceof Hash


What use case is this?

This line: https://github.com/pnpm/pnpm/blob/ef6c22e129dc3d76998cee33647b70a66d1f36bf/fetching/tarball-fetcher/src/remoteTarballFetcher.ts#L206

From what I see from the NPM data, the integrity usually is a 256 hash, and it only is one hash.

So, instead of needing to instantiate an Integrity class, we can just instantiate the Hash class and save some operations when single=true.

But thinking a little further, instead of supporting this behavior, maybe we can create a new API specifically to verify only one hash at a time like I put in the benchmark (which gives us almost 2x perf), intended for cases like this.

What do you think? Keep or create a new API?

We can also keep both since checkStream today breaks when we pass the single=true option, so it's not bad behavior at all.

Ok I'm following you now. The single: true option is already the signal we use to inform ssri that we are only parsing a single hash. Having checkStream not break with this param seems preferable to creating a whole new single hash API. Once that works in and of itself we can then optimize it as we would have new API.

Does this make sense? Feel free to advocate for your own opinion here, it is important.

I think you are right, my initial idea with the new API was to avoid the creation of IntegrityStream class and skip a bunch of verifications.

But actually, maybe we can do as you said and use single: true to perform those optimizations.

lib/index.js

wraithgar · 2023-04-04T16:43:43Z

The nested ternaries in #getOptions are extremely hard to follow, can we take this opportunity to remove them?

    this.algorithm = null
    if (this.sri instanceof Integrity) {
      this.goodSri = !!Object.keys(this.sri).length
      if (this.goodSri) {
        this.algorithm = this.sri.pickAlgorithm(this.opts)
      }
    } else if (this.sri instanceof Hash) {
      this.goodSri = this.sri
      this.algorithm = this.sri.algorithm
    }

lib/index.js

H4ad · 2023-04-04T18:12:39Z

I was reading the code, check if I'm not wrong, this piece of code:

ssri/lib/index.js

Lines 82 to 86 in 8e80eca

    
           const newSri = parse(this.hashes.map((h, i) => { 
        
             return `${this.algorithms[i]}-${h.digest('base64')}${this.optString}` 
        
           }).join(' '), this.opts) 
        
           // Integrity verification mode 
        
           const match = this.goodSri && newSri.match(this.sri, this.opts)

And then, the implementation of match is:

ssri/lib/index.js

Lines 283 to 298 in 8e80eca

    
           match (integrity, opts) { 
        
             const other = parse(integrity, opts) 
        
             if (!other) { 
        
               return false 
        
             } 
        
             const algo = other.pickAlgorithm(opts) 
        
             return ( 
        
               this[algo] && 
        
               other[algo] && 
        
               this[algo].find(hash => 
        
                 other[algo].find(otherhash => 
        
                   hash.digest === otherhash.digest 
        
                 ) 
        
               ) 
        
             ) || false 
        
           }

What this code is doing:

Hashing the stream with all the algorithms (from opts or default 512).
Then, getting the digest and parsing it again to transform to Integrity or Hash.
Then, picking just one of the algorithms (basically wasting the time spent to parse it because we just need one).
And finally, we compare the digest (that we already had)

If the assumptions are:

We need to compare all the hashes and make sure all match.

So, the code is not implemented correctly.

If the assumptions are:

We need to compare at least one hash.

So, the code is doing more than needed, we could just parse the sent digest, pick one, and then parse the stream for just that hash.

In the end, I think we could remove the pickAlgorithm function entirely, because the assumption behind it is wrong.

wraithgar · 2023-04-04T18:35:19Z

pickAlgorithm is a very very useful function still. Our package fetching library and caching library use it to pull out the "best" algorithm to use when reading fetching/reading data.

Reusing it in match I think was where we went wrong.

As far as what match should be doing, I think I need to think through a boolean chart.

Integrity can have one hash or multiple hashes, sri can have one hash or multiple hashes.

Integrity.match(sri):

(one hash).match(one hash) - Hash algorithms should match, and digests should match.
(one hash).match(multiple hashes) - Integrity hash algorithm should be found in sri hashes, and the digests of that hash should match.
(multiple hashes).match(one hash) - sri hash algorithm should be found in Integrity hashes, and the digests of that hash should match.
(multiple hashes).match(multiple hashes) - Either we find the "best" algorithm of any matching hashes, and the digests of that one hash should match, or we find all matching algorithms between the Integrity and sri, and all of the digests should match.

H4ad · 2023-04-04T19:10:25Z

@wraithgar Okay, I'll look into this more over the weekend to see if I can get more optimizations out of this.

But those optimizations will be along with the match fix, so I guess we could separate it into another PR.

About this PR, the only thing missing is the tests, right? For the bug on match we will just forward it to another PR and keep it as far as I can understand.

wraithgar · 2023-04-04T19:44:03Z

Yes by all means let's fix match in a separate PR.

wraithgar · 2023-04-06T15:22:10Z

Pinging "Always happy to be pinged" @jakebailey. Not sure if your interests are limited to semver or if they expand to other libs used by pnpm.

Two PRs have already landed here and were published as v10.0.2. I wouldn't bother submitting an update to pnpm quite yet because this PR is likely going to go out sooner than later also.

If you are involved in updating ssri in pnpm please note that toString() is no longer returning identical strings as before. Under the spec this doesn't matter, and as far as I can tell pnpm is not making that string do any heavy lifting on its own, but it's something to be aware of and definitely something we can revisit if it ends up being a problem for pnpm (or yarn).

jakebailey · 2023-04-06T16:57:29Z

In my real-world DT case, it doesn't seem to really do much. If anything, 10.0.2 itself is a little slower? Hard to say; there is variance here.

If you are involved in updating ssri in pnpm please note that toString() is no longer returning identical strings as before. Under the spec this doesn't matter, and as far as I can tell pnpm is not making that string do any heavy lifting on its own, but it's something to be aware of and definitely something we can revisit if it ends up being a problem for pnpm (or yarn).

What do you mean by "no longer returning identical strings"? That sort of thing will have an impact for lockfiles; IIRC pnpm's does contain this string, so it probably will be a problem because two different versions of pnpm which are supposed to use the same lockfile format will instead not agree on the integrity. So, hopefully that's not what you mean.

H4ad · 2023-04-06T17:06:54Z

@jakebailey The PR that changes the behavior of toString is this one: #75

Essentially, if the hash was created from a string like sha256-xxx sha512-xxx, the output when calling parsed.toString() is sha512-xxx sha256-xxx only when strict=true.

The only time PNPM call that method is in this place: https://github.com/search?q=repo%3Apnpm%2Fpnpm%20ssri.parse&type=code

So I think it will not be a problem for PNPM, for other package managers, I don't know if could be a problem because usually the integrity of the packages is just stored using a single algorithm, this little breaking change only affects when we see more than one algorithm with strict=true.

wraithgar · 2023-04-06T17:34:29Z

Yes, that's correct. As long as folks are not using the literal stringified representation to test equality w/ existing integrities, but are instead letting ssri parse it, they will be fine.

jakebailey · 2023-04-06T17:39:39Z

Sure, I'll defer to you all and @zkochan; my knowledge of this is limited to "I profiled the code and made it faster" 😄

wraithgar · 2023-04-10T18:28:20Z

@H4ad I think linting is all we're waiting on here. The .match bugfix is gonna take @nlf and I some digging to sort out.

wraithgar · 2023-04-10T18:34:05Z

lib/index.js

+  match (integrity, opts) {
+    const other = parse(integrity, opts)
+    if (!other) {
+      return false


looks like we need a test that hits this line.

This will take more time to add, I will try to add more test tomorrow.

No worries! Really appreciate your contributions lately.

I pushed the tests. I also changed the implementation a little, because I was doing the wrong verification when comparing the integrity. Tests are always good, ahhaaha

In fact, I still have some lines without coverage, but I'm not sure how I can properly test these lines

I need to tackle the match algorithm anyways. I'll fork from this branch to fix it and when it lands it'll land this also. Test coverage can be our problem.

wraithgar

Approved pending test coverage, to be done in a new PR that builds off of this.

wraithgar · 2023-04-11T18:34:10Z

This landed in #79!

H4ad requested a review from a team as a code owner April 4, 2023 15:14

H4ad requested a review from wraithgar April 4, 2023 15:14

wraithgar reviewed Apr 4, 2023

View reviewed changes

lib/index.js Outdated Show resolved Hide resolved

H4ad force-pushed the fix/faster-stream-integrity branch from afba080 to a70f2bb Compare April 4, 2023 16:15

wraithgar reviewed Apr 4, 2023

View reviewed changes

lib/index.js Show resolved Hide resolved

H4ad force-pushed the fix/faster-stream-integrity branch from a70f2bb to 00ce92e Compare April 4, 2023 16:59

wraithgar reviewed Apr 4, 2023

View reviewed changes

lib/index.js Outdated Show resolved Hide resolved

H4ad force-pushed the fix/faster-stream-integrity branch from 00ce92e to 5aab2dc Compare April 4, 2023 17:53

wraithgar mentioned this pull request Apr 4, 2023

chore: release 10.0.2 #73

Merged

H4ad force-pushed the fix/faster-stream-integrity branch from 5aab2dc to 2acfc3a Compare April 10, 2023 18:30

wraithgar reviewed Apr 10, 2023

View reviewed changes

H4ad force-pushed the fix/faster-stream-integrity branch 2 times, most recently from b382e86 to 6b4237f Compare April 11, 2023 11:23

perf: faster stream verification

fbe3f6c

H4ad force-pushed the fix/faster-stream-integrity branch from 6b4237f to fbe3f6c Compare April 11, 2023 11:32

wraithgar approved these changes Apr 11, 2023

View reviewed changes

wraithgar mentioned this pull request Apr 11, 2023

Integrity#match bugfix and other optimizations #79

Merged

wraithgar closed this Apr 11, 2023

wraithgar mentioned this pull request Mar 26, 2024

cli: implement node --run <script-in-package-json> nodejs/node#52190

Merged

4 tasks

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

fix: faster stream verification #76

fix: faster stream verification #76

H4ad commented Apr 4, 2023

wraithgar Apr 4, 2023

H4ad Apr 4, 2023 •

edited

Loading

wraithgar Apr 4, 2023

H4ad Apr 4, 2023

wraithgar commented Apr 4, 2023

H4ad commented Apr 4, 2023

wraithgar commented Apr 4, 2023

H4ad commented Apr 4, 2023

wraithgar commented Apr 4, 2023

wraithgar commented Apr 6, 2023

jakebailey commented Apr 6, 2023

H4ad commented Apr 6, 2023

wraithgar commented Apr 6, 2023 •

edited

Loading

jakebailey commented Apr 6, 2023

wraithgar commented Apr 10, 2023

wraithgar Apr 10, 2023

H4ad Apr 10, 2023

wraithgar Apr 10, 2023

H4ad Apr 11, 2023

H4ad Apr 11, 2023 •

edited

Loading

wraithgar Apr 11, 2023

wraithgar left a comment

wraithgar commented Apr 11, 2023

fix: faster stream verification #76

fix: faster stream verification #76

Conversation

H4ad commented Apr 4, 2023

Faster integrity check when the input is a stream

References

Choose a reason for hiding this comment

H4ad Apr 4, 2023 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

wraithgar commented Apr 4, 2023

H4ad commented Apr 4, 2023

wraithgar commented Apr 4, 2023

H4ad commented Apr 4, 2023

wraithgar commented Apr 4, 2023

wraithgar commented Apr 6, 2023

jakebailey commented Apr 6, 2023

H4ad commented Apr 6, 2023

wraithgar commented Apr 6, 2023 • edited Loading

jakebailey commented Apr 6, 2023

wraithgar commented Apr 10, 2023

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

H4ad Apr 11, 2023 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

wraithgar left a comment

Choose a reason for hiding this comment

wraithgar commented Apr 11, 2023

H4ad Apr 4, 2023 •

edited

Loading

wraithgar commented Apr 6, 2023 •

edited

Loading

H4ad Apr 11, 2023 •

edited

Loading