From ba43c4fd2bf83c360952c6454a6978e30f561969 Mon Sep 17 00:00:00 2001 From: Chiro Hiro Date: Tue, 23 May 2023 14:42:01 +0700 Subject: [PATCH 1/4] Remove unnecessary files that belong to old implementation of js-ipfs storage --- zkdb/src/storage/index.ts | 3 - zkdb/src/storage/ipfs-storage.ts | 192 ------------------------------- zkdb/src/storage/storage-test.ts | 24 ---- 3 files changed, 219 deletions(-) delete mode 100644 zkdb/src/storage/index.ts delete mode 100644 zkdb/src/storage/ipfs-storage.ts delete mode 100644 zkdb/src/storage/storage-test.ts diff --git a/zkdb/src/storage/index.ts b/zkdb/src/storage/index.ts deleted file mode 100644 index 6d8b97a6..00000000 --- a/zkdb/src/storage/index.ts +++ /dev/null @@ -1,3 +0,0 @@ -import { IPFSStorage, IPFSStorageConfiguration } from "./ipfs-storage"; - -export { IPFSStorage, IPFSStorageConfiguration } \ No newline at end of file diff --git a/zkdb/src/storage/ipfs-storage.ts b/zkdb/src/storage/ipfs-storage.ts deleted file mode 100644 index d4c03431..00000000 --- a/zkdb/src/storage/ipfs-storage.ts +++ /dev/null @@ -1,192 +0,0 @@ -import { CID } from 'multiformats'; -import * as IPFS from 'ipfs-core'; -import { PutOptions } from 'ipfs-core-types/src/dag'; -import { Poseidon, Encoding } from 'snarkyjs'; - -export function convertHexToUint8Array(hexString: string): Uint8Array { - const hex = hexString - .replace(/^0x/i, '') - .padStart(hexString.length + (hexString.length % 2), '0'); - const result = new Uint8Array(hex.length / 2); - - let j = 0; - for (let i = 0; i < result.length; i += 1) { - j = i * 2; - result[i] = parseInt(hex.substring(j, j + 2), 16); - } - - return result; -} - -export interface IPFSStorageConfiguration { - database: string; -} - -export class IPFSStorage { - private config: IPFSStorageConfiguration; - - private collections: any = {}; - - private nodeInstance: IPFS.IPFS; - - constructor( - IPFSNodeInstance: IPFS.IPFS, - config?: Partial - ) { - this.config = { ...this.config, ...config }; - this.nodeInstance = IPFSNodeInstance; - } - - private get databasePath(): string { - return `/${this.config.database}`; - } - - private get metadataPath(): string { - return `${this.databasePath}/metadata.zkdb`; - } - - private getCollectionPath(collection: string): string { - return `${this.databasePath}/${collection}.json`; - } - - private async poseidonHash(document: any): Promise { - const encoder = new TextEncoder(); - - const doc = encoder.encode(JSON.stringify(document)) - - // Calculate poseidon hash of document - const hexDigest = convertHexToUint8Array( - Poseidon.hash(Encoding.Bijective.Fp.fromBytes(doc)).toString() - ); - - return (await this.nodeInstance.bases.getBase('base32')).encoder - .encode(hexDigest) - .toString(); - } - - private async isExist(path: string, filename: string) { - let status = false; - const fullPath = path + '/' + filename - try { - const fileStatus = await this.nodeInstance.files.stat(fullPath); - if (fileStatus) { - status = true; - } - } catch (e) { - let message = 'Unknown Error' - if (e instanceof Error) message = e.message - console.log(message) - status = false; - } - return status - } - - private async readFile(filename: string): Promise { - let chunks = []; - let length = 0; - for await (const chunk of this.nodeInstance.files.read(filename)) { - chunks.push(chunk); - length += chunk.length; - } - let data = new Uint8Array(length); - length = 0; - for (let i = 0; i < chunks.length; i += 1) { - data.set(chunks[i], length); - length += chunks[i].length; - } - return data; - } - - private async 
readJSON(filename: string): Promise { - let data = ''; - let decoder = new TextDecoder(); - - for await (const chunk of this.nodeInstance.files.read(filename)) { - data += decoder.decode(chunk); - } - return JSON.parse(data); - } - - private async loadDatabase() { - if (!(await this.isExist('/', this.config.database))) { - await this.nodeInstance.files.mkdir(this.databasePath); - } - } - - private async loadCollection(collection: string) { - if (!(await this.isExist(this.databasePath, `${collection}.json`))) { - // Create metadata file for zkDatabase - await this.nodeInstance.files.touch(this.getCollectionPath(collection)); - // Write {} to the file - await this.nodeInstance.files.write( - this.getCollectionPath(collection), - new Uint8Array([123, 125]) - ); - } - - this.collections[collection] = await this.readJSON( - this.getCollectionPath(collection) - ); - } - - public static async init( - config?: IPFSStorageConfiguration - ): Promise { - const instance = new IPFSStorage( - await IPFS.create({ peerStoreCacheSize: 10 }), - config - ); - await instance.loadDatabase(); - return instance; - } - - public async put(collection: string, document: T, option?: PutOptions) { - await this.loadCollection(collection); - let documentDigest = await this.poseidonHash(document); - - const result = await this.nodeInstance.dag.put(document, { - pin: true, - ...option, - }); - - const cid = result.toString(); - this.collections[collection][documentDigest] = cid; - - await this.nodeInstance.files.write( - this.getCollectionPath(collection), - JSON.stringify(this.collections[collection]) - ); - - return { - CID: cid, - documentID: documentDigest, - timestamp: Date.now(), - database: this.config.database, - collection, - document, - }; - } - - public async get(collection: string, documentID: string) { - if ( - typeof this.collections[collection] === 'undefined' || - typeof this.collections[collection][documentID] === 'undefined' - ) { - await this.loadCollection(collection); - } - - if ( - typeof this.collections[collection] !== 'undefined' && - typeof this.collections[collection][documentID] !== 'undefined' - ) { - const cid = CID.parse(this.collections[collection][documentID]); - const DAGResult = await this.nodeInstance.dag.get(cid); - return { - CID: cid.toString(), - documentID: documentID, - ...DAGResult, - }; - } - return undefined; - } -} diff --git a/zkdb/src/storage/storage-test.ts b/zkdb/src/storage/storage-test.ts deleted file mode 100644 index 135797da..00000000 --- a/zkdb/src/storage/storage-test.ts +++ /dev/null @@ -1,24 +0,0 @@ -import { IPFSStorage, IPFSStorageConfiguration } from "./ipfs-storage.js"; - -async function run() { - const config: IPFSStorageConfiguration = { database: "TestDatabase" } - - const storage = await IPFSStorage.init(config); - - const obj = { - employees: [ - { firstName: "John", lastName: "Doe" }, - { firstName: "Anna", lastName: "Smith" }, - { firstName: "Peter", lastName: "Jones" }, - ], - }; - - const result = await storage.put("TestFile", obj, { pin: true }); - console.log(result) - - const getTestResult = await storage.get("TestFile", result.documentID) - console.log(getTestResult) - console.log(getTestResult?.value) -} - -run() From e26e89c0d814a2efe089231471744653401bf143 Mon Sep 17 00:00:00 2001 From: Chiro Hiro Date: Tue, 23 May 2023 14:45:09 +0700 Subject: [PATCH 2/4] Add specification for ipfs and ipfs storage engine --- specs/docs/distributed-storage-engine/ipfs.md | 93 +++++++++++++++++++ .../storage-engine.md | 74 +++++++++++++++ 2 files changed, 167 
insertions(+)
 create mode 100644 specs/docs/distributed-storage-engine/ipfs.md
 create mode 100644 specs/docs/distributed-storage-engine/storage-engine.md

diff --git a/specs/docs/distributed-storage-engine/ipfs.md b/specs/docs/distributed-storage-engine/ipfs.md
new file mode 100644
index 00000000..a287991b
--- /dev/null
+++ b/specs/docs/distributed-storage-engine/ipfs.md
@@ -0,0 +1,93 @@
+# IPFS
+
+IPFS is a distributed protocol that allows you to replicate data across the network: you can put data into IPFS and get it back as long as the network keeps a live copy of it.
+
+## CID
+
+A CID is a unique fingerprint of the data: you can access the data as long as you know its exact CID. The CID is calculated with a hash function, but it isn't the digest of the data itself. Instead, the CID is derived from the digests of the blocks of data.
+
+That hash is combined with codec information about the block using multiformats:
+
+- Multihash for information on the algorithm used to hash the data.
+- Multicodec for information on how to interpret the hashed data after it has been fetched.
+- Multibase for information on how the hashed data is encoded. Multibase is only used in the string representation of the CID.
+
+In our implementation we use CID v1 with `SHA256` + `base58`. We suppose that `poseidon` could be a better fit in the long term, so we would need to submit a poseidon proposal to `multihash`.
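+
+The snippet below is a minimal sketch of how such a CID can be put together with the `multiformats` package. It assumes the content fits in a single `raw` block; the package names and calls illustrate the digest + codec + multibase composition described above and are not part of our implementation.
+
+```ts
+import { CID } from "multiformats/cid";
+import * as raw from "multiformats/codecs/raw";
+import { sha256 } from "multiformats/hashes/sha2";
+import { base58btc } from "multiformats/bases/base58";
+
+// Hash the block, then wrap the digest with codec information to form a CIDv1.
+const bytes = new TextEncoder().encode("hello zkDatabase");
+const digest = await sha256.digest(bytes);
+const cid = CID.createV1(raw.code, digest);
+
+// Multibase only affects the string representation of the CID.
+console.log(cid.toString(base58btc));
+```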
+
+## IPNS
+
+As we know from above, each DAG node is immutable, so its identifier changes whenever the data changes. In reality, we want a pointer to the data that stays the same. [IPNS](https://docs.ipfs.tech/concepts/ipns/) solves this by providing a permanent pointer (in fact, it's a hash of a public key).
+
+## Merkle DAG
+
+A Merkle DAG is a DAG where each node has an identifier, and this is the result of hashing the node's contents — any opaque payload carried by the node and the list of identifiers of its children — using a cryptographic hash function like SHA256. This brings some important considerations.
+
+Our data will be stored in a sub-merkle DAG. Every time we alter a leaf, the sub-merkle DAG node also changes and its CID has to be recomputed. This impacts our implementation, since we need a metadata file to keep track of the CIDs and their children.
+
+## Javascript IPFS
+
+[js-ipfs](https://github.com/ipfs/js-ipfs) paves the way for the Browser implementation of the IPFS protocol. Written entirely in JavaScript, it runs in a Browser, a Service Worker, a Web Extension and Node.js, opening the door to a world of possibilities. Our zkDatabase utilizes this package to provide accessibility to data.
+
+We switched to [Helia](https://github.com/ipfs/helia) because `js-ipfs` is discontinued.
+
+## libp2p
+
+LibP2p provides the building blocks for p2p applications; it handles everything p2p-network related, alongside its modules. It's flexible to use and develop with [libp2p](https://github.com/libp2p/js-libp2p). To configure and work with libp2p you need to define:
+
+- Transport: [TCP](https://github.com/libp2p/js-libp2p-tcp), [WebSockets](https://github.com/libp2p/js-libp2p-websockets). These two transports handle connections in different ways. TCP lets you handle connections natively, but it requires the `Node.js` run-time instead of a browser. The WebSockets module works for both, with lower performance.
+- Encryption: [noise](https://github.com/ChainSafe/js-libp2p-noise). We don't have any other option, since TLS doesn't have a JS implementation.
+- Multiplexer: We have two options, [mplex](https://github.com/libp2p/js-libp2p-mplex) and [yamux](https://github.com/ChainSafe/js-libp2p-yamux). A multiplexer improves protocol performance and node handling. `mplex` is preferable for `tcp`, while `yamux` is preferred for `WebSockets`.
+- Node discovery: [KAD DHT](https://github.com/libp2p/js-libp2p-kad-dht). We prefer this library to handle node discovery and routing. We tried the `bootstrap` module, but it isn't working well; that's why we connect to the bootstrap nodes directly during construction.
+
+```ts
+const nodeP2p = await createLibp2p(config);
+// Manual patch for node bootstrap
+const addresses = [
+  "/dnsaddr/bootstrap.libp2p.io/p2p/QmNnooDu7bfjPFoTZYxMNLWUQJyrVwtbZg5gBMjTezGAJN",
+  "/dnsaddr/bootstrap.libp2p.io/p2p/QmQCU2EcMqAqQPR2i9bChDtGNJchTbq5TbXJJ16u19uLTa",
+  "/dnsaddr/bootstrap.libp2p.io/p2p/QmbLHAnMoJPWSCR5Zhtx6BHJX9KiKNN6tpvbUcqanj75Nb",
+  "/dnsaddr/bootstrap.libp2p.io/p2p/QmcZf59bWwK5XFi76CZX8cbJ4BhTzzA3gU1ZjYZcYW3dwt",
+].map((e) => multiaddr(e));
+for (let i = 0; i < addresses.length; i += 1) {
+  await nodeP2p.dial(addresses[i]);
+}
+await nodeP2p.start();
+```
+
+## Helia
+
+[Helia](https://github.com/ipfs/helia) is a new project that handles `ipfs` in a modular manner. You can construct a new instance of `Helia` on top of libp2p.
+
+```ts
+return createHelia({
+  blockstore: new FsBlockstore(
+    ((storage)).location
+  ),
+  libp2p,
+});
+```
+
+By passing a libp2p instance to Helia, it's highly configurable.
+
+## UnixFS
+
+To handle file I/O, we use [UnixFS](https://github.com/ipfs/helia-unixfs). It can be constructed in the same way as `Helia`, but it takes a `Helia` instance instead of a `libp2p` instance.
+
+```ts
+const fs = unixfs(heliaNode);
+let text = "";
+const decoder = new TextDecoder();
+
+let testCID = CID.parse("QmdASJKc1koDd9YczZwAbYWzUKbJU73g6YcxCnDzgxWtp3");
+if (testCID) {
+  console.log("Read:", testCID);
+  for await (const chunk of fs.cat(testCID)) {
+    text += decoder.decode(chunk, {
+      stream: true,
+    });
+  }
+  console.log(text);
+}
+```
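+
+Writing follows the same pattern. The sketch below reuses the `fs` instance from the snippet above and stores a small payload with `addBytes`; the payload itself is only an example.
+
+```ts
+// Store raw bytes; UnixFS builds the DAG and returns the CID of its root.
+const newCID = await fs.addBytes(new TextEncoder().encode("hello zkDatabase"));
+console.log("Stored:", newCID.toString());
+```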
+
+After doing research on `libp2p` and `ipfs`, we introduce `StorageEngineIPFS`, which handles `ipfs` I/O. The details are given in the [specs](./storage-engine.md). In our implementation, we use `datastore-fs` and `blockstore-fs` to persist changes.
diff --git a/specs/docs/distributed-storage-engine/storage-engine.md b/specs/docs/distributed-storage-engine/storage-engine.md
new file mode 100644
index 00000000..fdb429ca
--- /dev/null
+++ b/specs/docs/distributed-storage-engine/storage-engine.md
@@ -0,0 +1,74 @@
+## Storage Engine
+
+The storage engine helps us to handle file storage and the local caching process; it also helps to index files for later access.
+
+### IPFS Storage Engine
+
+The IPFS Storage Engine is a distributed storage engine based on [IPFS](https://ipfs.tech/). `StorageEngineIPFS` is an implementation of `IFileSystem` and `IFileIndex` that handles all I/O operations and indexing.
+
+```ts
+/**
+ * An interface of file engine; depending on the environment,
+ * the file engine could be different.
+ * S: filename type, T: content identifier type, R: raw data type
+ */
+export interface IFileSystem<S, T, R> {
+  writeBytes(_data: R): Promise<T>;
+  write(_filename: S, _data: R): Promise<T>;
+  read(_filename: S): Promise<R>;
+  remove(_filename: S): Promise<boolean>;
+}
+
+/**
+ * Methods that perform indexing and file lookup
+ */
+export interface IFileIndex<S, T, R> {
+  publish(_contentID: T): Promise<R>;
+  republish(): void;
+  resolve(_peerID?: S): Promise<T>;
+}
+
+/**
+ * IPFS file system
+ */
+export type TIPFSFileSystem = IFileSystem<string, CID, Uint8Array>;
+
+/**
+ * IPFS file index
+ */
+export type TIPFSFileIndex = IFileIndex<PeerId, CID, IPNSEntry>;
+```
+
+The relationship between `StorageEngineIPFS` and other classes/interfaces is shown below:
+
+```mermaid
+classDiagram
+  LibP2pNode -- StorageEngineIPFS
+  Helia -- StorageEngineIPFS
+  UnixFS -- StorageEngineIPFS
+  IPNS -- StorageEngineIPFS
+  IFileSystem <|-- StorageEngineIPFS
+  IFileIndex <|-- StorageEngineIPFS
+  IFileSystem : writeBytes(data Uint8Array) CID
+  IFileSystem : write(filename string, data Uint8Array) CID
+  IFileSystem : read(filename string) Uint8Array
+  IFileSystem : remove(filename string) boolean
+  IFileIndex : publish(contentID CID) IPNSEntry
+  IFileIndex : republish() void
+  IFileIndex : resolve(peerID PeerId) CID
+  StorageEngineIPFS : static getInstance(basePath, config)
+```
+
+In our implementation, we use `datastore-fs` and `blockstore-fs` to persist changes to local files; for now, the browser lacks the performance to handle connections and I/O. So the best possible solution is to provide a local node that handles all I/O and connections.
+
+#### File mutability
+
+DAG nodes are immutable, but we are unable to update the `CID` everywhere every time it changes. So `IPNS` is used: `IPNS` creates a record that maps a `CID` to a `PeerID`. Since the `PeerID` is unchanged, as long as we keep the `IPNSEntry` updated, other people can get the current `CID` of the zkDatabase.
+
+#### Metadata
+
+The metadata file holds a mapping from a document's poseidon hash to its `CID`, which allows us to retrieve the data from ipfs. It's also used to reconstruct the merkle tree.
+
+#### BSON Document
+
+BSON, or Binary JSON, is the format that we use to encode and decode documents. Documents are grouped into collections.
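+
+As a small illustration of this encode/decode step (using the `bson` npm package; the document shape is only an example), a document can be round-tripped like this before its bytes are hashed and handed to the storage engine:
+
+```ts
+import { serialize, deserialize } from "bson";
+
+// Encode a document into BSON bytes.
+const bytes = serialize({ firstName: "John", lastName: "Doe" });
+
+// Decode the same bytes back into a plain object.
+const doc = deserialize(bytes);
+console.log(doc.firstName); // "John"
+```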

From 66d48284587da2b591b012998227ad9d05e6f2ea Mon Sep 17 00:00:00 2001
From: Chiro Hiro
Date: Wed, 24 May 2023 10:56:12 +0700
Subject: [PATCH 3/4] Update document and clarify following suggestions from Robin

---
 specs/docs/distributed-storage-engine/ipfs.md | 33 ++++++++++++++-----
 1 file changed, 24 insertions(+), 9 deletions(-)

diff --git a/specs/docs/distributed-storage-engine/ipfs.md b/specs/docs/distributed-storage-engine/ipfs.md
index a287991b..380abd6c 100644
--- a/specs/docs/distributed-storage-engine/ipfs.md
+++ b/specs/docs/distributed-storage-engine/ipfs.md
@@ -1,12 +1,21 @@
 # IPFS
 
-IPFS is a distributed protocol that allows you to replicate data across the network: you can put data into IPFS and get it back as long as the network keeps a live copy of it.
+IPFS is a distributed protocol that allows you to replicate data across the network: you can put data into IPFS and get it back as long as the network keeps a live copy of it. Data is stored as blocks, and each block is identified by its digest.
+
+## PeerID
+
+A PeerID is a unique identifier of a node in the network. It's a hash of the node's public key. The libp2p keypair is handled by its keychain. You can get the PeerID like this:
+
+```ts
+const libp2p = await createLibp2p({});
+libp2p.peerId.toString();
+```
 
 ## CID
 
 A CID is a unique fingerprint of the data: you can access the data as long as you know its exact CID. The CID is calculated with a hash function, but it isn't the digest of the data itself. Instead, the CID is derived from the digests of the blocks of data.
 
-That hash is combined with codec information about the block using multiformats:
+That digest is combined with codec information about the block using multiformats:
 
 - Multihash for information on the algorithm used to hash the data.
 - Multicodec for information on how to interpret the hashed data after it has been fetched.
@@ -24,9 +33,11 @@ A Merkle DAG is a DAG where each node has an identifier, and this is the result
 
 Our data will be stored in a sub-merkle DAG. Every time we alter a leaf, the sub-merkle DAG node also changes and its CID has to be recomputed. This impacts our implementation, since we need a metadata file to keep track of the CIDs and their children.
 
+We can perform a lookup on a merkle DAG by using the CID of the root node. We can also perform a lookup on a sub-merkle DAG by using the CID of the root node of that sub-merkle DAG. DAG traversal is a recursive process that starts at the root node and ends when the desired node is found. This process is cheap and fast, since it only requires the node identifier.
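+
+As an illustration of that recursive lookup, the sketch below assumes the blocks are encoded with `dag-cbor` (so links between blocks are `CID` values) and reuses the `heliaNode` instance from the UnixFS example; the actual layout used by zkDatabase may differ.
+
+```ts
+import { dagCbor } from "@helia/dag-cbor";
+import { CID } from "multiformats/cid";
+
+const dag = dagCbor(heliaNode);
+
+// Follow one link per path segment, starting from the root CID of the (sub-)merkle DAG.
+async function lookup(root: CID, path: string[]): Promise<unknown> {
+  const node = (await dag.get(root)) as Record<string, unknown>;
+  if (path.length === 0) {
+    return node;
+  }
+  const [head, ...rest] = path;
+  const next = node[head];
+  // A CID value means the child lives in another block, so we recurse into it.
+  return next instanceof CID ? lookup(next, rest) : next;
+}
+```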
+
 ## Javascript IPFS
 
-[js-ipfs](https://github.com/ipfs/js-ipfs) paves the way for the Browser implementation of the IPFS protocol. Written entirely in JavaScript, it runs in a Browser, a Service Worker, a Web Extension and Node.js, opening the door to a world of possibilities. Our zkDatabase utilizes this package to provide accessibility to data.
+[js-ipfs](https://github.com/ipfs/js-ipfs) paves the way for the Browser implementation of the IPFS protocol. Written entirely in JavaScript, it runs in a Browser, a Service Worker, a Web Extension and Node.js, opening the door to a world of possibilities.
 
 We switched to [Helia](https://github.com/ipfs/helia) because `js-ipfs` is discontinued.
 
@@ -34,10 +45,16 @@ We switched to [Helia](https://github.com/ipfs/helia) because `js-ipfs` is disc
 
 LibP2p provides the building blocks for p2p applications; it handles everything p2p-network related, alongside its modules. It's flexible to use and develop with [libp2p](https://github.com/libp2p/js-libp2p). To configure and work with libp2p you need to define:
 
-- Transport: [TCP](https://github.com/libp2p/js-libp2p-tcp), [WebSockets](https://github.com/libp2p/js-libp2p-websockets). These two transports handle connections in different ways. TCP lets you handle connections natively, but it requires the `Node.js` run-time instead of a browser. The WebSockets module works for both, with lower performance.
+- Transport:
+  - [TCP](https://github.com/libp2p/js-libp2p-tcp): The TCP transport module helps you manage connections between nodes natively. TCP handles the connection at the transport layer (layer 4), which is why it's more efficient at maintaining connections. But it only works in the `Node.js` run-time.
+  - [WebSockets](https://github.com/libp2p/js-libp2p-websockets): WebSockets, in contrast to TCP, work at the application layer (layer 7), which is why they're less efficient at maintaining connections. But they work in both `Node.js` and the browser.
 - Encryption: [noise](https://github.com/ChainSafe/js-libp2p-noise). We don't have any other option, since TLS doesn't have a JS implementation.
-- Multiplexer: We have two options, [mplex](https://github.com/libp2p/js-libp2p-mplex) and [yamux](https://github.com/ChainSafe/js-libp2p-yamux). A multiplexer improves protocol performance and node handling. `mplex` is preferable for `tcp`, while `yamux` is preferred for `WebSockets`.
-- Node discovery: [KAD DHT](https://github.com/libp2p/js-libp2p-kad-dht). We prefer this library to handle node discovery and routing. We tried the `bootstrap` module, but it isn't working well; that's why we connect to the bootstrap nodes directly during construction.
+- Multiplexer:
+  - [mplex](https://github.com/libp2p/js-libp2p-mplex): `mplex` is a simple stream multiplexer that was designed in the early days of libp2p. It is a simple protocol that does not provide many features offered by other stream multiplexers. Notably, `mplex` does not provide flow control, a feature which is now considered critical for a stream multiplexer. `mplex` runs over a reliable, ordered pipe between two peers, such as a TCP connection. Peers can open, write to, close, and reset a stream. mplex uses a message-based framing layer like yamux, enabling it to multiplex different data streams, including stream-oriented data and other types of messages.
+  - [yamux](https://github.com/ChainSafe/js-libp2p-yamux): Yamux (Yet Another Multiplexer) is a powerful stream multiplexer used in libp2p. It was initially developed by Hashicorp for Go, and is now implemented in Rust, JavaScript and other languages. It enables multiple parallel streams on a single TCP connection. The design was inspired by SPDY (which later became the basis for HTTP/2); however, it is not compatible with it. One of the key features of Yamux is its support for flow control through backpressure. This mechanism helps to prevent data from being sent faster than it can be processed. It allows the receiver to specify an offset to which the sender can send data, which increases as the receiver processes the data. This helps prevent the sender from overwhelming the receiver, especially when the receiver has limited resources or needs to process complex data. _**Note**: Yamux should be used over mplex in libp2p, as mplex doesn't provide a mechanism to apply backpressure on the stream level._
+- Node discovery: [KAD DHT](https://github.com/libp2p/js-libp2p-kad-dht): The Kademlia Distributed Hash Table (DHT), or Kad-DHT, is a distributed hash table that is designed for P2P networks. Kad-DHT in libp2p is a subsystem based on the [Kademlia whitepaper](https://docs.libp2p.io/concepts/discovery-routing/kaddht/#:~:text=based%20on%20the-,Kademlia%20whitepaper,-.). Kad-DHT offers a way to find nodes and data on the network by using a [routing table](https://docs.libp2p.io/concepts/discovery-routing/kaddht/#:~:text=by%20using%20a-,routing%20table,-that%20organizes%20peers) that organizes peers based on how similar their keys are.
+
+_**Note:** The KAD DHT bootstrap didn't work as expected; that's why we connect to the bootstrap nodes directly during construction._
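+
+To tie these choices together, here is a sketch of one possible `config` for the node constructed below. It is an assumption rather than a definitive setup: the module names are current at the time of writing, and the exact option shape (for example, whether the DHT is registered under a `services` key) depends on the libp2p version.
+
+```ts
+import { tcp } from "@libp2p/tcp";
+import { webSockets } from "@libp2p/websockets";
+import { noise } from "@chainsafe/libp2p-noise";
+import { yamux } from "@chainsafe/libp2p-yamux";
+import { kadDHT } from "@libp2p/kad-dht";
+
+// One module per concern: transport, encryption, stream multiplexing and discovery/routing.
+const config = {
+  transports: [tcp(), webSockets()],
+  connectionEncryption: [noise()],
+  streamMuxers: [yamux()],
+  dht: kadDHT(),
+};
+```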
 
 ```ts
 const nodeP2p = await createLibp2p(config);
@@ -60,9 +77,7 @@ await nodeP2p.start();
 
 ```ts
 return createHelia({
-  blockstore: new FsBlockstore(
-    ((storage)).location
-  ),
+  blockstore: new FsBlockstore("./local-storage"),
   libp2p,
 });
 ```

From 9385e640190c7fa26a543f9c78ba13492c347631 Mon Sep 17 00:00:00 2001
From: Chiro Hiro
Date: Wed, 24 May 2023 10:58:57 +0700
Subject: [PATCH 4/4] Add comment about metadata caching at local file system

---
 specs/docs/distributed-storage-engine/storage-engine.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/specs/docs/distributed-storage-engine/storage-engine.md b/specs/docs/distributed-storage-engine/storage-engine.md
index fdb429ca..0f20bd41 100644
--- a/specs/docs/distributed-storage-engine/storage-engine.md
+++ b/specs/docs/distributed-storage-engine/storage-engine.md
@@ -67,7 +67,7 @@ DAG nodes are immutable, but we are unable to update the `CID` everywhere every
 
 #### Metadata
 
-The metadata file holds a mapping from a document's poseidon hash to its `CID`, which allows us to retrieve the data from ipfs. It's also used to reconstruct the merkle tree.
+The metadata file holds a mapping from a document's poseidon hash to its `CID`, which allows us to retrieve the data from ipfs. It's also used to reconstruct the merkle tree. The metadata is stored on IPFS, and we also keep a copy on the local file system.
 
 #### BSON Document