diff --git a/.eslintrc.yaml b/.eslintrc.yaml index ac6062d82..6dac883ba 100644 --- a/.eslintrc.yaml +++ b/.eslintrc.yaml @@ -1,7 +1,7 @@ extends: - airbnb-base parserOptions: - ecmaVersion: 2020 + ecmaVersion: 2022 env: node: true mocha: true @@ -106,3 +106,10 @@ overrides: rules: func-names: 0 import/no-extraneous-dependencies: 0 + - files: + - src/**/*[iI]nterface.js + rules: + no-unused-vars: 0 + require-yield: 0 + class-methods-use-this: 0 + no-empty-function: 0 diff --git a/README.md b/README.md index 43dae8068..487292c43 100644 --- a/README.md +++ b/README.md @@ -19,11 +19,11 @@ - [Core](#core) - [Configuring](#configuring) - [Configuration file](#configuration-file) - - [Storage adapters](#storage-adapters) + - [Storage repositories](#storage-repositories) - [Environment variables](#environment-variables) - [Running](#running) - [Deploying](#deploying) -- [Publishing](#publishing) +- [Publishing](#publishing) - [Contributing](#contributing) - [Adding or updating a service](#adding-a-new-service-or-updating-an-existing-service) - [Core engine](#core-engine) @@ -144,7 +144,7 @@ When refering to the base folder, it means the folder where you will be `git pul 5. If you are using a special repo instance (e.g., `dating-declarations`), create a new [config file](#configuring), `config/development.json`, and add: ```json { - + "services": { "declarationsPath": "..//declarations" } @@ -185,12 +185,12 @@ The default configuration can be found in `config/default.json`. 
The full refere "recorder": { "versions": { "storage": { - "": "Storage adapter configuration object; see below" + "": "Storage repository configuration object; see below" } }, "snapshots": { "storage": { - "": "Storage adapter configuration object; see below" + "": "Storage repository configuration object; see below" } } }, @@ -235,9 +235,9 @@ The default configuration is merged with (and overridden by) environment-specifi If you want to change your local configuration, we suggest you create a `config/development.json` file with overridden values. An example of a production configuration file can be found in `config/production.json`. -##### Storage adapters +##### Storage repositories -Two storage adapters are currently supported: Git and MongoDB. Each one can be used independently for versions and snapshots. +Two storage repositories are currently supported: Git and MongoDB. Each one can be used independently for versions and snapshots. ###### Git diff --git a/ops/README.md b/ops/README.md index 078e9eeb4..a75ee1770 100644 --- a/ops/README.md +++ b/ops/README.md @@ -168,7 +168,7 @@ In order to automatically set up a virtual machine: 2. Install [VirtualBox](https://www.virtualbox.org/wiki/Downloads) to manage virtual machines. If you prefer Docker, or have an Apple Silicon machine, install [Docker](https://docs.docker.com/get-docker/) instead. 3. Create a dedicated SSH key with no password: `ssh-keygen -f ~/.ssh/ota-vagrant -q -N ""`. This key will be automatically used by Vagrant. -> VirtualBox is not compatible with Apple Silicon (M1…) processors. If you have such a machine, you will need to use the Docker provider. Since MongoDB cannot be installed on ARM, it is skipped in the infrastructure installation process. This means you cannot test the MongoDB storage adapter with Vagrant with an Apple Silicon processor. +> VirtualBox is not compatible with Apple Silicon (M1…) processors. If you have such a machine, you will need to use the Docker provider. 
Since MongoDB cannot be installed on ARM, it is skipped in the infrastructure installation process. This means you cannot test the MongoDB storage repository with Vagrant with an Apple Silicon processor. ### Launch diff --git a/package-lock.json b/package-lock.json index 411f5dbde..a2005ea37 100644 --- a/package-lock.json +++ b/package-lock.json @@ -11,6 +11,7 @@ "dependencies": { "@accordproject/markdown-cicero": "^0.15.2", "@accordproject/markdown-pdf": "^0.15.2", + "@opentermsarchive/simple-git": "^3.7.2", "abort-controller": "^3.0.0", "ajv": "^6.12.6", "archiver": "^5.3.0", @@ -46,7 +47,6 @@ "puppeteer-extra": "^3.2.3", "puppeteer-extra-plugin-stealth": "^2.9.0", "sib-api-v3-sdk": "^8.2.1", - "simple-git": "^2.47.0", "turndown": "^7.0.0", "winston": "^3.3.3", "winston-mail": "^2.0.0" @@ -790,6 +790,36 @@ "resolved": "https://registry.npmjs.org/@octokit/webhooks-types/-/webhooks-types-4.12.0.tgz", "integrity": "sha512-G0k7CoS9bK+OI7kPHgqi1KqK4WhrjDQSjy0wJI+0OTx/xvbHUIZDeqatY60ceeRINP/1ExEk6kTARboP0xavEw==" }, + "node_modules/@opentermsarchive/simple-git": { + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@opentermsarchive/simple-git/-/simple-git-3.7.2.tgz", + "integrity": "sha512-H47Io1nMzkehYMD2ZKyoHcG/cH1zoFzCtw77aAi9qWELa4RnziSdQsO7JYmvFNJKyUXwyyenGaEtaJe0D2LVqA==", + "dependencies": { + "@kwsites/file-exists": "^1.1.1", + "@kwsites/promise-deferred": "^1.1.1", + "debug": "^4.3.3" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/steveukx/" + } + }, + "node_modules/@opentermsarchive/simple-git/node_modules/debug": { + "version": "4.3.4", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", + "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", + "dependencies": { + "ms": "2.1.2" + }, + "engines": { + "node": ">=6.0" + }, + "peerDependenciesMeta": { + "supports-color": { + "optional": true + } + } + }, "node_modules/@sinonjs/commons": { 
"version": "1.8.3", "resolved": "https://registry.npmjs.org/@sinonjs/commons/-/commons-1.8.3.tgz", @@ -6659,32 +6689,6 @@ "url": "https://github.com/sponsors/ljharb" } }, - "node_modules/simple-git": { - "version": "2.47.0", - "resolved": "https://registry.npmjs.org/simple-git/-/simple-git-2.47.0.tgz", - "integrity": "sha512-+HfCpqPBEZTPWiW9fPdbiPJDslM22MLqrktfzNKyI2pWaJa6DhfNVx4Mds04KZzVv5vjC9/ksw3y5gVf8ECWDg==", - "dependencies": { - "@kwsites/file-exists": "^1.1.1", - "@kwsites/promise-deferred": "^1.1.1", - "debug": "^4.3.2" - } - }, - "node_modules/simple-git/node_modules/debug": { - "version": "4.3.2", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.2.tgz", - "integrity": "sha512-mOp8wKcvj7XxC78zLgw/ZA+6TSgkoE2C/ienthhRD298T7UNwAg9diBpLRxC0mOezLl4B0xV7M0cCO6P/O0Xhw==", - "dependencies": { - "ms": "2.1.2" - }, - "engines": { - "node": ">=6.0" - }, - "peerDependenciesMeta": { - "supports-color": { - "optional": true - } - } - }, "node_modules/simple-swizzle": { "version": "0.2.2", "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz", @@ -8237,6 +8241,26 @@ "resolved": "https://registry.npmjs.org/@octokit/webhooks-types/-/webhooks-types-4.12.0.tgz", "integrity": "sha512-G0k7CoS9bK+OI7kPHgqi1KqK4WhrjDQSjy0wJI+0OTx/xvbHUIZDeqatY60ceeRINP/1ExEk6kTARboP0xavEw==" }, + "@opentermsarchive/simple-git": { + "version": "3.7.2", + "resolved": "https://registry.npmjs.org/@opentermsarchive/simple-git/-/simple-git-3.7.2.tgz", + "integrity": "sha512-H47Io1nMzkehYMD2ZKyoHcG/cH1zoFzCtw77aAi9qWELa4RnziSdQsO7JYmvFNJKyUXwyyenGaEtaJe0D2LVqA==", + "requires": { + "@kwsites/file-exists": "^1.1.1", + "@kwsites/promise-deferred": "^1.1.1", + "debug": "^4.3.3" + }, + "dependencies": { + "debug": { + "version": "4.3.4", + "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.4.tgz", + "integrity": "sha512-PRWFHuSU3eDtQJPvnNY7Jcket1j0t5OuOsFzPPzsekD52Zl8qUfFIPEiswXqIvHWGVHOgX+7G/vCNNhehwxfkQ==", + "requires": { + "ms": "2.1.2" + } + } + } + }, 
"@sinonjs/commons": { "version": "1.8.3", "resolved": "https://registry.npmjs.org/@sinonjs/commons/-/commons-1.8.3.tgz", @@ -12624,26 +12648,6 @@ } } }, - "simple-git": { - "version": "2.47.0", - "resolved": "https://registry.npmjs.org/simple-git/-/simple-git-2.47.0.tgz", - "integrity": "sha512-+HfCpqPBEZTPWiW9fPdbiPJDslM22MLqrktfzNKyI2pWaJa6DhfNVx4Mds04KZzVv5vjC9/ksw3y5gVf8ECWDg==", - "requires": { - "@kwsites/file-exists": "^1.1.1", - "@kwsites/promise-deferred": "^1.1.1", - "debug": "^4.3.2" - }, - "dependencies": { - "debug": { - "version": "4.3.2", - "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.2.tgz", - "integrity": "sha512-mOp8wKcvj7XxC78zLgw/ZA+6TSgkoE2C/ienthhRD298T7UNwAg9diBpLRxC0mOezLl4B0xV7M0cCO6P/O0Xhw==", - "requires": { - "ms": "2.1.2" - } - } - } - }, "simple-swizzle": { "version": "0.2.2", "resolved": "https://registry.npmjs.org/simple-swizzle/-/simple-swizzle-0.2.2.tgz", diff --git a/package.json b/package.json index b512517d8..072cc0244 100644 --- a/package.json +++ b/package.json @@ -13,7 +13,7 @@ "license": "EUPL-1.2", "author": "ambanum", "type": "module", - "main": "index.js", + "main": "src/main.js", "bin": { "ota-lint-declarations": "./bin/lint-declarations.js", "ota-validate-declarations": "./bin/validate-declarations.js" @@ -37,6 +37,7 @@ "dependencies": { "@accordproject/markdown-cicero": "^0.15.2", "@accordproject/markdown-pdf": "^0.15.2", + "@opentermsarchive/simple-git": "^3.7.2", "abort-controller": "^3.0.0", "ajv": "^6.12.6", "archiver": "^5.3.0", @@ -72,7 +73,6 @@ "puppeteer-extra": "^3.2.3", "puppeteer-extra-plugin-stealth": "^2.9.0", "sib-api-v3-sdk": "^8.2.1", - "simple-git": "^2.47.0", "turndown": "^7.0.0", "winston": "^3.3.3", "winston-mail": "^2.0.0" diff --git a/scripts/dataset/export/index.js b/scripts/dataset/export/index.js index 2eac4d892..33947ed5e 100644 --- a/scripts/dataset/export/index.js +++ b/scripts/dataset/export/index.js @@ -3,8 +3,9 @@ import path from 'path'; import { fileURLToPath } from 'url'; 
import archiver from 'archiver'; +import config from 'config'; -import { instantiateVersionsStorageAdapter } from '../../../src/index.js'; +import RepositoryFactory from '../../../src/archivist/recorder/repositories/factory.js'; import * as renamer from '../../utils/renamer/index.js'; import readme from '../assets/README.template.js'; import logger from '../logger/index.js'; @@ -16,7 +17,7 @@ const fs = fsApi.promises; const ARCHIVE_FORMAT = 'zip'; // for supported formats, see https://www.archiverjs.com/docs/archive-formats export default async function generate({ archivePath, releaseDate }) { - const versionsStorageAdapter = await (instantiateVersionsStorageAdapter()).initialize(); + const versionsRepository = await RepositoryFactory.create(config.get('recorder.versions.storage')).initialize(); const archive = await initializeArchive(archivePath); @@ -28,7 +29,7 @@ export default async function generate({ archivePath, releaseDate }) { let index = 1; - for await (const version of versionsStorageAdapter.iterate()) { + for await (const version of versionsRepository.iterate()) { const { content, fetchDate } = version; const { serviceId, documentType } = renamer.applyRules(version.serviceId, version.documentType); @@ -70,7 +71,7 @@ export default async function generate({ archivePath, releaseDate }) { archive.stream.finalize(); await archive.done; - await versionsStorageAdapter.finalize(); + await versionsRepository.finalize(); return { servicesCount: services.size, diff --git a/scripts/dataset/export/index.test.js b/scripts/dataset/export/index.test.js index fa46eb83c..dbb04176f 100644 --- a/scripts/dataset/export/index.test.js +++ b/scripts/dataset/export/index.test.js @@ -8,7 +8,8 @@ import dircompare from 'dir-compare'; import mime from 'mime'; import StreamZip from 'node-stream-zip'; -import GitAdapter from '../../../src/storage-adapters/git/index.js'; +import Record from '../../../src/archivist/recorder/record.js'; +import GitRepository from 
'../../../src/archivist/recorder/repositories/git/index.js'; import generateArchive from './index.js'; @@ -30,6 +31,8 @@ const FOURTH_FETCH_DATE = '2022-01-01T12:12:24.000Z'; const FIRST_CONTENT = 'First Content'; const SECOND_CONTENT = 'Second Content'; +const MIME_TYPE = 'text/markdown'; + const SNAPSHOT_ID = '721ce4a63ad399ecbdb548a66d6d327e7bc97876'; const RELEASE_DATE = '2022-01-01T18:21:00.000Z'; @@ -41,49 +44,53 @@ describe('Export', () => { const TMP_PATH = path.resolve(__dirname, './tmp'); const EXPECTED_DATASET_PATH = path.resolve(__dirname, './test/fixtures/dataset'); - let storageAdapter; + let repository; let zip; before(async function () { this.timeout(10000); - storageAdapter = new GitAdapter({ + repository = new GitRepository({ ...config.get('recorder.versions.storage.git'), path: path.resolve(__dirname, '../../../', config.get('recorder.versions.storage.git.path')), }); - await storageAdapter.initialize(); + await repository.initialize(); - await storageAdapter.record({ + await repository.save(new Record({ serviceId: FIRST_SERVICE_PROVIDER_ID, documentType: FIRST_DOCUMENT_TYPE, content: FIRST_CONTENT, + mimeType: MIME_TYPE, fetchDate: FIRST_FETCH_DATE, snapshotId: SNAPSHOT_ID, - }); + })); - await storageAdapter.record({ + await repository.save(new Record({ serviceId: FIRST_SERVICE_PROVIDER_ID, documentType: FIRST_DOCUMENT_TYPE, content: SECOND_CONTENT, + mimeType: MIME_TYPE, fetchDate: SECOND_FETCH_DATE, snapshotId: SNAPSHOT_ID, - }); + })); - await storageAdapter.record({ + await repository.save(new Record({ serviceId: SECOND_SERVICE_PROVIDER_ID, documentType: FIRST_DOCUMENT_TYPE, content: FIRST_CONTENT, + mimeType: MIME_TYPE, fetchDate: THIRD_FETCH_DATE, snapshotId: SNAPSHOT_ID, - }); + })); - await storageAdapter.record({ + await repository.save(new Record({ serviceId: SECOND_SERVICE_PROVIDER_ID, documentType: SECOND_DOCUMENT_TYPE, content: FIRST_CONTENT, + mimeType: MIME_TYPE, fetchDate: FOURTH_FETCH_DATE, snapshotId: SNAPSHOT_ID, - }); + })); 
await generateArchive({ archivePath: ARCHIVE_PATH, @@ -97,7 +104,7 @@ describe('Export', () => { after(async () => { await fs.rm(TMP_PATH, { recursive: true }); - await storageAdapter._removeAllRecords(); + await repository.removeAll(); }); it('is an archive', () => { diff --git a/scripts/history/logger/index.js b/scripts/history/logger/index.js new file mode 100644 index 000000000..afe0491e1 --- /dev/null +++ b/scripts/history/logger/index.js @@ -0,0 +1,39 @@ +import winston from 'winston'; + +import logger from '../../../src/logger/index.js'; + +const { combine, timestamp, printf, colorize } = winston.format; + +export const format = combine( + colorize(), + timestamp({ format: 'YYYY-MM-DD HH:mm:ss' }), + printf(({ level, message, timestamp, serviceId, type, id, current, total }) => { + let prefix = ''.padEnd(8); + + if (current && total) { + prefix = `${Number(((current) / total) * 100).toFixed(2)}%`.padEnd(8); + } + + if (serviceId) { + prefix += `${serviceId}`.padEnd(30); + } + + if (type) { + if (type.length > 50) { + type = `${type.substring(0, 48)}…`; + } + + prefix += `${type}`.padEnd(50); + } + + if (id) { + prefix += `${id}`.padEnd(42); + } + + return `${timestamp} ${level.padEnd(15)} ${prefix}${message}`; + }), +); + +logger.format = format; + +export default logger; diff --git a/scripts/history/migrate-services.js b/scripts/history/migrate-services.js new file mode 100644 index 000000000..4e3a32538 --- /dev/null +++ b/scripts/history/migrate-services.js @@ -0,0 +1,212 @@ +import fsApi from 'fs'; +import path from 'path'; +import { fileURLToPath } from 'url'; + +import config from 'config'; +import winston from 'winston'; + +import GitRepository from '../../src/archivist/recorder/repositories/git/index.js'; + +import { format } from './logger/index.js'; +import { importReadmeInGit } from './utils/index.js'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const ROOT_PATH = path.resolve(__dirname, '../../'); +const fs = fsApi.promises; 
+ +const CONFIG = { + servicesToMigrate: [ 'ASICS', 'Amazon', 'Orange Money France' ], + from: { + snapshots: 'france-snapshots', + versions: 'france-versions-hash-updated', + prefixMessageToSnapshotId: 'This version was recorded after filtering snapshot https://github.com/OpenTermsArchive/france-snapshots/commit/', + }, + to: { + snapshots: 'france-elections-snapshots', + versions: 'france-elections-versions-hash-updated', + prefixMessageToSnapshotId: 'This version was recorded after filtering snapshot https://github.com/OpenTermsArchive/france-elections-snapshots/commit/', + }, +}; + +const counters = { + migrated: 0, + skipped: 0, +}; + +(async function main() { + console.time('Total time'); + + const migration = { + services: CONFIG.servicesToMigrate, + from: { + snapshots: { + source: new GitRepository({ + ...config.get('recorder.snapshots.storage.git'), + path: path.resolve(ROOT_PATH, `./data/${CONFIG.from.snapshots}`), + }), + destination: new GitRepository({ + ...config.get('recorder.snapshots.storage.git'), + path: path.resolve(ROOT_PATH, `./data/${CONFIG.from.snapshots}-migrated`), + }), + logger: winston.createLogger({ transports: [ new (winston.transports.File)({ filename: `${__dirname}/logs/${CONFIG.from.snapshots}.log` }), new winston.transports.Console() ], format }), + }, + versions: { + source: new GitRepository({ + ...config.get('recorder.versions.storage.git'), + path: path.resolve(ROOT_PATH, `./data/${CONFIG.from.versions}`), + }), + destination: new GitRepository({ + ...config.get('recorder.versions.storage.git'), + path: path.resolve(ROOT_PATH, `./data/${CONFIG.from.versions}-migrated`), + prefixMessageToSnapshotId: CONFIG.from.prefixMessageToSnapshotId, + }), + logger: winston.createLogger({ transports: [ new (winston.transports.File)({ filename: `${__dirname}/logs/${CONFIG.from.versions}.log` }), new winston.transports.Console() ], format }), + }, + }, + to: { + snapshots: { + source: new GitRepository({ + 
...config.get('recorder.snapshots.storage.git'), + path: path.resolve(ROOT_PATH, `./data/${CONFIG.to.snapshots}`), + }), + destination: new GitRepository({ + ...config.get('recorder.snapshots.storage.git'), + path: path.resolve(ROOT_PATH, `./data/${CONFIG.to.snapshots}-migrated`), + }), + logger: winston.createLogger({ transports: [ new (winston.transports.File)({ filename: `${__dirname}/logs/${CONFIG.to.snapshots}.log` }), new winston.transports.Console() ], format }), + }, + versions: { + source: new GitRepository({ + ...config.get('recorder.versions.storage.git'), + path: path.resolve(ROOT_PATH, `./data/${CONFIG.to.versions}`), + }), + destination: new GitRepository({ + ...config.get('recorder.versions.storage.git'), + path: path.resolve(ROOT_PATH, `./data/${CONFIG.to.versions}-migrated`), + prefixMessageToSnapshotId: CONFIG.to.prefixMessageToSnapshotId, + }), + logger: winston.createLogger({ transports: [ new (winston.transports.File)({ filename: `${__dirname}/logs/${CONFIG.to.versions}.log` }), new winston.transports.Console() ], format }), + }, + }, + }; + + await initialize(migration); + + const fromSnapshotsRecords = await migration.from.snapshots.source.findAll(); + const toSnapshotsRecords = await migration.to.snapshots.source.findAll(); + const snapshotsToMigrate = fromSnapshotsRecords.filter(({ serviceId }) => migration.services.includes(serviceId)); + const fromSnapshotsRecordsToRewrite = fromSnapshotsRecords.filter(({ serviceId }) => !migration.services.includes(serviceId)); + const toSnapshotsRecordsMigrated = [ ...toSnapshotsRecords, ...snapshotsToMigrate ].sort((recordA, recordB) => new Date(recordA.fetchDate) - new Date(recordB.fetchDate)); + + const fromVersionsRecords = await migration.from.versions.source.findAll(); + const toVersionsRecords = await migration.to.versions.source.findAll(); + const versionsToMigrate = fromVersionsRecords.filter(({ serviceId }) => migration.services.includes(serviceId)); + const fromVersionsRecordsToRewrite = 
fromVersionsRecords.filter(({ serviceId }) => !migration.services.includes(serviceId)); + const toVersionsRecordsMigrated = [ ...toVersionsRecords, ...versionsToMigrate ].sort((recordA, recordB) => new Date(recordA.fetchDate) - new Date(recordB.fetchDate)); + + console.log('Number of snapshots in the source', fromSnapshotsRecords.length); + console.log('Number of snapshots in the target', toSnapshotsRecords.length); + console.log('Number of snapshots to migrate', snapshotsToMigrate.length); + + console.log('Number of versions in the source', fromVersionsRecords.length); + console.log('Number of versions in the target', toVersionsRecords.length); + console.log('Number of versions to migrate', versionsToMigrate.length); + + const idsMapping = {}; + + await Promise.all([ + rewriteSnapshots(migration.from.snapshots.destination, fromSnapshotsRecordsToRewrite, idsMapping, migration.from.snapshots.logger), + rewriteSnapshots(migration.to.snapshots.destination, toSnapshotsRecordsMigrated, idsMapping, migration.to.snapshots.logger), + ]); + + await fs.writeFile(path.join(__dirname, 'ids-mapping.json'), JSON.stringify(idsMapping, null, 4)); + + console.log('Snapshots migrated\n'); + + await Promise.all([ + rewriteVersions(migration.from.versions.destination, fromVersionsRecordsToRewrite, idsMapping, migration.from.versions.logger), + rewriteVersions(migration.to.versions.destination, toVersionsRecordsMigrated, idsMapping, migration.to.versions.logger), + ]); + + console.log(`Records treated: ${Object.values(counters).reduce((acc, value) => acc + value, 0)}`); + console.log(`⌙ Migrated records: ${counters.migrated}`); + console.log(`⌙ Skipped records: ${counters.skipped}`); + console.timeEnd('Total time'); + + await finalize(migration); +}()); + +async function rewriteSnapshots(repository, records, idsMapping, logger) { + let i = 1; + + for (const record of records) { + const { id: recordId } = await repository.save(record); // eslint-disable-line no-await-in-loop + + 
idsMapping[record.id] = recordId; // Saves the mapping between the old ID and the new one. + + if (recordId) { + logger.info({ message: `Migrated snapshot with new ID: ${recordId}`, serviceId: record.serviceId, type: record.documentType, id: record.id, current: i++, total: records.length }); + counters.migrated++; + } else { + logger.info({ message: 'Skipped snapshot', serviceId: record.serviceId, type: record.documentType, id: record.id, current: i++, total: records.length }); + counters.skipped++; + } + } +} + +async function rewriteVersions(repository, records, idsMapping, logger) { + let i = 1; + + for (const record of records) { + const newSnapshotId = idsMapping[record.snapshotId]; + + if (!newSnapshotId) { + throw new Error(`Snapshot ID ${record.snapshotId} not found for record ${record.id}`); + } + + record.snapshotId = newSnapshotId; + + const { id: recordId } = await repository.save(record); // eslint-disable-line no-await-in-loop + + if (recordId) { + logger.info({ message: `Migrated version with new ID: ${recordId}`, serviceId: record.serviceId, type: record.documentType, id: record.id, current: i++, total: records.length }); + counters.migrated++; + } else { + logger.info({ message: 'Skipped version', serviceId: record.serviceId, type: record.documentType, id: record.id, current: i++, total: records.length }); + counters.skipped++; + } + } +} + +async function initialize(migration) { + await Promise.all([ + migration.from.snapshots.source.initialize(), + migration.from.snapshots.destination.initialize(), + migration.from.versions.source.initialize(), + migration.from.versions.destination.initialize(), + migration.to.snapshots.source.initialize(), + migration.to.snapshots.destination.initialize(), + migration.to.versions.source.initialize(), + migration.to.versions.destination.initialize(), + ]); + + return Promise.all([ + importReadmeInGit({ from: migration.from.snapshots.source, to: migration.from.snapshots.destination }), + importReadmeInGit({ from: 
migration.from.versions.source, to: migration.from.versions.destination }), + importReadmeInGit({ from: migration.to.snapshots.source, to: migration.to.snapshots.destination }), + importReadmeInGit({ from: migration.to.versions.source, to: migration.to.versions.destination }), + ]); +} + +async function finalize(migration) { + return Promise.all([ + migration.from.snapshots.source.finalize(), + migration.from.snapshots.destination.finalize(), + migration.from.versions.source.finalize(), + migration.from.versions.destination.finalize(), + migration.to.snapshots.source.finalize(), + migration.to.snapshots.destination.finalize(), + migration.to.versions.source.finalize(), + migration.to.versions.destination.finalize(), + ]); +} diff --git a/scripts/history/update-to-full-hash.js b/scripts/history/update-to-full-hash.js new file mode 100644 index 000000000..f01ac171d --- /dev/null +++ b/scripts/history/update-to-full-hash.js @@ -0,0 +1,61 @@ +import path from 'path'; +import { fileURLToPath } from 'url'; + +import config from 'config'; + +import GitRepository from '../../src/archivist/recorder/repositories/git/index.js'; + +import logger from './logger/index.js'; +import { importReadmeInGit } from './utils/index.js'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const ROOT_PATH = path.resolve(__dirname, '../../'); + +(async function main() { + console.time('Total time'); + + const versionsRepository = new GitRepository({ + ...config.get('recorder.versions.storage.git'), + path: path.resolve(ROOT_PATH, './data/france-elections-versions'), + }); + + const versionsTargetRepository = new GitRepository({ + ...config.get('recorder.versions.storage.git'), + prefixMessageToSnapshotId: 'This version was recorded after filtering snapshot https://github.com/OpenTermsArchive/france-elections-snapshots/commit/', + path: path.resolve(ROOT_PATH, './data/france-elections-versions-hash-updated-test'), + }); + + const snapshotsRepository = new GitRepository({ + 
...config.get('recorder.snapshots.storage.git'), + path: path.resolve(ROOT_PATH, './data/france-elections-snapshots'), + }); + + await versionsRepository.initialize(); + await versionsTargetRepository.initialize(); + await snapshotsRepository.initialize(); + + await importReadmeInGit({ from: versionsRepository, to: versionsTargetRepository }); + + const total = await versionsRepository.count(); + let current = 1; + + for await (const record of versionsRepository.iterate()) { + const fullSnapshotId = await snapshotsRepository.git.getFullHash(record.snapshotId); + + record.snapshotId = fullSnapshotId; + + const { id: recordId } = await versionsTargetRepository.save(record); + + if (!recordId) { + logger.warn({ message: 'Record skipped', serviceId: record.serviceId, type: record.documentType, id: record.id, current, total }); + } else { + logger.info({ message: `Update short sha ${record.snapshotId} to ${fullSnapshotId}`, serviceId: record.serviceId, type: record.documentType, id: record.id, current, total }); + } + + current++; + } + + await versionsRepository.finalize(); + await versionsTargetRepository.finalize(); + await snapshotsRepository.finalize(); +}()); diff --git a/scripts/history/utils/index.js b/scripts/history/utils/index.js new file mode 100644 index 000000000..b242d35b8 --- /dev/null +++ b/scripts/history/utils/index.js @@ -0,0 +1,23 @@ +import fsApi from 'fs'; + +const fs = fsApi.promises; + +export async function importReadmeInGit({ from: sourceRepository, to: targetRepository }) { + const sourceRepositoryReadmePath = `${sourceRepository.path}/README.md`; + const targetRepositoryReadmePath = `${targetRepository.path}/README.md`; + + const [firstReadmeCommit] = await sourceRepository.git.log(['README.md']); + + if (!firstReadmeCommit) { + console.warn(`No commit found for README in ${sourceRepository.path}`); + + return; + } + + await fs.copyFile(sourceRepositoryReadmePath, targetRepositoryReadmePath); + await targetRepository._commit({ + filePath: 
targetRepositoryReadmePath, + message: firstReadmeCommit.message, + date: firstReadmeCommit.date, + }); +} diff --git a/scripts/import/index.js b/scripts/import/index.js index 1436b10a4..1646c5104 100644 --- a/scripts/import/index.js +++ b/scripts/import/index.js @@ -8,7 +8,7 @@ import mime from 'mime'; import { MongoClient } from 'mongodb'; import nodeFetch from 'node-fetch'; -import Git from '../../src/storage-adapters/git/git.js'; +import Git from '../../src/archivist/recorder/repositories/git/git.js'; import * as renamer from '../utils/renamer/index.js'; import logger from './logger/index.js'; diff --git a/scripts/import/loadCommits.js b/scripts/import/loadCommits.js index 7c4b501a5..4718b51b9 100644 --- a/scripts/import/loadCommits.js +++ b/scripts/import/loadCommits.js @@ -5,7 +5,7 @@ import { fileURLToPath } from 'url'; import config from 'config'; import { MongoClient } from 'mongodb'; -import Git from '../../src/storage-adapters/git/git.js'; +import Git from '../../src/archivist/recorder/repositories/git/git.js'; import logger from './logger/index.js'; diff --git a/scripts/rewrite/initializer/index.js b/scripts/rewrite/initializer/index.js index 73beb07cb..c69b7d70d 100644 --- a/scripts/rewrite/initializer/index.js +++ b/scripts/rewrite/initializer/index.js @@ -4,7 +4,7 @@ import { fileURLToPath } from 'url'; import config from 'config'; -import Git from '../../../src/storage-adapters/git/git.js'; +import Git from '../../../src/archivist/recorder/repositories/git/git.js'; import { fileExists } from '../utils.js'; const fs = fsApi.promises; @@ -21,7 +21,10 @@ export async function initReadmeAndLicense(targetRepo, targetPath, authorDate) { await fs.copyFile(LICENSE_PATH, targetLicenseFilePath); await targetRepo.add(targetReadmeFilePath); await targetRepo.add(targetLicenseFilePath); - await targetRepo.commit(null, 'Add Readme and License', authorDate); + await targetRepo.commit({ + message: 'Add readme and license', + date: authorDate, + }); } export async 
function initTargetRepo(targetRepoPath) { diff --git a/scripts/rewrite/rewrite-snapshots.js b/scripts/rewrite/rewrite-snapshots.js index 19e4fec94..eb9cbc96f 100644 --- a/scripts/rewrite/rewrite-snapshots.js +++ b/scripts/rewrite/rewrite-snapshots.js @@ -4,8 +4,8 @@ import { fileURLToPath } from 'url'; import config from 'config'; import Recorder from '../../src/archivist/recorder/index.js'; -import Git from '../../src/storage-adapters/git/git.js'; -import GitAdapter from '../../src/storage-adapters/git/index.js'; +import Git from '../../src/archivist/recorder/repositories/git/git.js'; +import GitRepository from '../../src/archivist/recorder/repositories/git/index.js'; import * as renamer from '../utils/renamer/index.js'; import * as initializer from './initializer/index.js'; @@ -50,11 +50,11 @@ let recorder; } recorder = new Recorder({ - versionsStorageAdapter: new GitAdapter({ + versionsRepository: new GitRepository({ ...config.get('recorder.versions.storage.git'), path: VERSIONS_TARGET_PATH, }), - snapshotsStorageAdapter: new GitAdapter({ + snapshotsRepository: new GitRepository({ ...config.get('recorder.snapshots.storage.git'), path: SNAPSHOTS_TARGET_PATH, }), diff --git a/scripts/rewrite/rewrite-versions.js b/scripts/rewrite/rewrite-versions.js index 47d734149..8d6db49ac 100644 --- a/scripts/rewrite/rewrite-versions.js +++ b/scripts/rewrite/rewrite-versions.js @@ -6,9 +6,9 @@ import config from 'config'; import { InaccessibleContentError } from '../../src/archivist/errors.js'; import filter from '../../src/archivist/filter/index.js'; import Recorder from '../../src/archivist/recorder/index.js'; +import Git from '../../src/archivist/recorder/repositories/git/git.js'; +import GitRepository from '../../src/archivist/recorder/repositories/git/index.js'; import * as services from '../../src/archivist/services/index.js'; -import Git from '../../src/storage-adapters/git/git.js'; -import GitAdapter from '../../src/storage-adapters/git/index.js'; import * as renamer 
from '../utils/renamer/index.js'; import * as initializer from './initializer/index.js'; @@ -59,11 +59,11 @@ let recorder; } recorder = new Recorder({ - versionsStorageAdapter: new GitAdapter({ + versionsRepository: new GitRepository({ ...config.get('recorder.versions.storage.git'), path: VERSIONS_TARGET_PATH, }), - snapshotsStorageAdapter: new GitAdapter({ + snapshotsRepository: new GitRepository({ ...config.get('recorder.snapshots.storage.git'), path: SNAPSHOTS_SOURCE_PATH, }), diff --git a/src/archivist/fetcher/index.test.js b/src/archivist/fetcher/index.test.js index d9c350cbc..733a90de5 100644 --- a/src/archivist/fetcher/index.test.js +++ b/src/archivist/fetcher/index.test.js @@ -69,7 +69,7 @@ describe('Fetcher', function () { expect(content).to.equal(termsHTML); }); - it('returns the mime type of the given URL', async () => { + it('returns the MIME type of the given URL', async () => { expect(mimeType).to.equal('text/html'); }); @@ -82,7 +82,7 @@ describe('Fetcher', function () { expect(content).to.equal(termsHTML); }); - it('returns the mime type of the given URL', async () => { + it('returns the MIME type of the given URL', async () => { expect(mimeType).to.equal('text/html'); }); }); @@ -99,7 +99,7 @@ describe('Fetcher', function () { expect(content).to.equal(termsHTML); }); - it('returns the mime type of the given URL', async () => { + it('returns the MIME type of the given URL', async () => { expect(mimeType).to.equal('text/html'); }); @@ -112,7 +112,7 @@ describe('Fetcher', function () { expect(content).to.equal(termsHTML); }); - it('returns the mime type of the given URL', async () => { + it('returns the MIME type of the given URL', async () => { expect(mimeType).to.equal('text/html'); }); }); diff --git a/src/archivist/index.js b/src/archivist/index.js index 498768221..ff86b1607 100644 --- a/src/archivist/index.js +++ b/src/archivist/index.js @@ -41,9 +41,9 @@ export default class Archivist extends events.EventEmitter { return 
Object.keys(this.services); } - constructor({ storage: { versions, snapshots } }) { + constructor({ recorderConfig }) { super(); - this.recorder = new Recorder({ versionsStorageAdapter: versions, snapshotsStorageAdapter: snapshots }); + this.recorder = new Recorder(recorderConfig); } async initialize() { @@ -105,7 +105,7 @@ export default class Archivist extends events.EventEmitter { await Promise.all([ launchHeadlessBrowser(), this.recorder.initialize() ]); - this._forEachDocumentOf(servicesIds, documentDeclaration => this.trackDocumentChangesQueue.push(documentDeclaration)); + this.#forEachDocumentOf(servicesIds, documentDeclaration => this.trackDocumentChangesQueue.push(documentDeclaration)); await this.trackDocumentChangesQueue.drain(); @@ -164,7 +164,7 @@ export default class Archivist extends events.EventEmitter { await this.recorder.initialize(); - this._forEachDocumentOf(servicesIds, documentDeclaration => this.refilterDocumentsQueue.push(documentDeclaration)); + this.#forEachDocumentOf(servicesIds, documentDeclaration => this.refilterDocumentsQueue.push(documentDeclaration)); await this.refilterDocumentsQueue.drain(); await this.recorder.finalize(); @@ -194,7 +194,7 @@ export default class Archivist extends events.EventEmitter { }); } - async _forEachDocumentOf(servicesIds = [], callback) { // eslint-disable-line default-param-last + async #forEachDocumentOf(servicesIds = [], callback) { // eslint-disable-line default-param-last servicesIds.forEach(serviceId => { this.services[serviceId].getDocumentTypes().forEach(documentType => { callback(this.services[serviceId].getDocumentDeclaration(documentType)); diff --git a/src/archivist/index.test.js b/src/archivist/index.test.js index ca6387f64..93f037332 100644 --- a/src/archivist/index.test.js +++ b/src/archivist/index.test.js @@ -8,8 +8,7 @@ import nock from 'nock'; import sinon from 'sinon'; import sinonChai from 'sinon-chai'; -import Git from '../storage-adapters/git/git.js'; -import GitAdapter from 
'../storage-adapters/git/index.js'; +import Git from './recorder/repositories/git/git.js'; import Archivist, { AVAILABLE_EVENTS } from './index.js'; @@ -26,16 +25,13 @@ const VERSIONS_PATH = path.resolve(ROOT_PATH, config.get('recorder.versions.stor const MIME_TYPE = 'text/html'; const FETCH_DATE = new Date('2000-01-02T12:00:00.000Z'); - -let snapshotsStorageAdapter; -let versionsStorageAdapter; +let gitVersion; +let app; async function resetGitRepositories() { - return Promise.all([ snapshotsStorageAdapter._removeAllRecords(), versionsStorageAdapter._removeAllRecords() ]); + return Promise.all([ app.recorder.snapshotsRepository.removeAll(), app.recorder.versionsRepository.removeAll() ]); } -let gitVersion; - describe('Archivist', function () { this.timeout(10000); @@ -69,23 +65,13 @@ describe('Archivist', function () { serviceAVersionExpectedContent = await fs.readFile(path.resolve(ROOT_PATH, 'test/fixtures/service_A_terms.md'), { encoding: 'utf8' }); serviceBSnapshotExpectedContent = await fs.readFile(path.resolve(ROOT_PATH, 'test/fixtures/terms.pdf')); serviceBVersionExpectedContent = await fs.readFile(path.resolve(ROOT_PATH, 'test/fixtures/termsFromPDF.md'), { encoding: 'utf8' }); - snapshotsStorageAdapter = new GitAdapter({ - ...config.get('recorder.snapshots.storage.git'), - path: SNAPSHOTS_PATH, - }); - versionsStorageAdapter = new GitAdapter({ - ...config.get('recorder.versions.storage.git'), - path: VERSIONS_PATH, - }); }); describe('#trackChanges', () => { - let app; - before(async () => { nock('https://www.servicea.example').get('/tos').reply(200, serviceASnapshotExpectedContent, { 'Content-Type': 'text/html' }); nock('https://www.serviceb.example').get('/privacy').reply(200, serviceBSnapshotExpectedContent, { 'Content-Type': 'application/pdf' }); - app = new Archivist({ storage: { versions: versionsStorageAdapter, snapshots: snapshotsStorageAdapter } }); + app = new Archivist({ recorderConfig: config.get('recorder') }); await app.initialize(); }); @@ 
-163,13 +149,13 @@ describe('Archivist', function () { before(async () => { nock('https://www.servicea.example').get('/tos').reply(200, serviceASnapshotExpectedContent, { 'Content-Type': 'text/html' }); nock('https://www.serviceb.example').get('/privacy').reply(200, serviceBSnapshotExpectedContent, { 'Content-Type': 'application/pdf' }); - const app = new Archivist({ storage: { versions: versionsStorageAdapter, snapshots: snapshotsStorageAdapter } }); + app = new Archivist({ recorderConfig: config.get('recorder') }); await app.initialize(); await app.trackChanges(serviceIds); - ({ id: originalSnapshotId } = await snapshotsStorageAdapter.getLatestRecord(SERVICE_A_ID, SERVICE_A_TYPE)); - ({ id: firstVersionId } = await versionsStorageAdapter.getLatestRecord(SERVICE_A_ID, SERVICE_A_TYPE)); + ({ id: originalSnapshotId } = await app.recorder.snapshotsRepository.findLatest(SERVICE_A_ID, SERVICE_A_TYPE)); + ({ id: firstVersionId } = await app.recorder.versionsRepository.findLatest(SERVICE_A_ID, SERVICE_A_TYPE)); serviceBCommits = await gitVersion.log({ file: SERVICE_B_EXPECTED_VERSION_FILE_PATH }); @@ -219,7 +205,7 @@ describe('Archivist', function () { before(async () => { nock('https://www.servicea.example').get('/tos').reply(200, serviceASnapshotExpectedContent, { 'Content-Type': 'text/html' }); nock('https://www.serviceb.example').get('/privacy').reply(200, serviceBSnapshotExpectedContent, { 'Content-Type': 'application/pdf' }); - const app = new Archivist({ storage: { versions: versionsStorageAdapter, snapshots: snapshotsStorageAdapter } }); + app = new Archivist({ recorderConfig: config.get('recorder') }); await app.initialize(); await app.trackChanges(serviceIds); @@ -265,7 +251,7 @@ describe('Archivist', function () { } before(async () => { - app = new Archivist({ storage: { versions: versionsStorageAdapter, snapshots: snapshotsStorageAdapter } }); + app = new Archivist({ recorderConfig: config.get('recorder') }); await app.initialize(); 
AVAILABLE_EVENTS.forEach(event => { diff --git a/src/archivist/recorder/index.js b/src/archivist/recorder/index.js index c20f444b2..2b2a7c1ef 100644 --- a/src/archivist/recorder/index.js +++ b/src/archivist/recorder/index.js @@ -1,23 +1,22 @@ -export default class Recorder { - constructor({ versionsStorageAdapter, snapshotsStorageAdapter }) { - if (!versionsStorageAdapter || !snapshotsStorageAdapter) { - throw new RangeError('Storage adapters should be defined both for versions and snapshots'); - } +import Record from './record.js'; +import RepositoryFactory from './repositories/factory.js'; - this.versionsStorageAdapter = versionsStorageAdapter; - this.snapshotsStorageAdapter = snapshotsStorageAdapter; +export default class Recorder { + constructor(config) { + this.versionsRepository = RepositoryFactory.create(config.versions.storage); + this.snapshotsRepository = RepositoryFactory.create(config.snapshots.storage); } async initialize() { - return Promise.all([ this.versionsStorageAdapter.initialize(), this.snapshotsStorageAdapter.initialize() ]); + return Promise.all([ this.versionsRepository.initialize(), this.snapshotsRepository.initialize() ]); } async finalize() { - return Promise.all([ this.versionsStorageAdapter.finalize(), this.snapshotsStorageAdapter.finalize() ]); + return Promise.all([ this.versionsRepository.finalize(), this.snapshotsRepository.finalize() ]); } async getLatestSnapshot(serviceId, documentType) { - return this.snapshotsStorageAdapter.getLatestRecord(serviceId, documentType); + return this.snapshotsRepository.findLatest(serviceId, documentType); } async recordSnapshot({ serviceId, documentType, fetchDate, mimeType, content }) { @@ -41,7 +40,7 @@ export default class Recorder { throw new Error('A document mime type is required to ensure data consistency'); } - return this.snapshotsStorageAdapter.record({ serviceId, documentType, fetchDate, mimeType, content }); + return this.snapshotsRepository.save(new Record({ serviceId, documentType, 
fetchDate, mimeType, content })); } async recordVersion({ serviceId, documentType, snapshotId, fetchDate, mimeType, content, isRefilter }) { @@ -69,7 +68,7 @@ export default class Recorder { throw new Error('A document mime type is required to ensure data consistency'); } - return this.versionsStorageAdapter.record({ serviceId, documentType, snapshotId, fetchDate, mimeType, content, isRefilter }); + return this.versionsRepository.save(new Record({ serviceId, documentType, snapshotId, fetchDate, mimeType, content, isRefilter })); } async recordRefilter(params) { diff --git a/src/archivist/recorder/index.test.js b/src/archivist/recorder/index.test.js index 5f150357a..b69c6c861 100644 --- a/src/archivist/recorder/index.test.js +++ b/src/archivist/recorder/index.test.js @@ -1,19 +1,8 @@ -import path from 'path'; -import { fileURLToPath } from 'url'; - import chai from 'chai'; import config from 'config'; -import GitAdapter from '../../storage-adapters/git/index.js'; -import MongoAdapter from '../../storage-adapters/mongo/index.js'; - import Recorder from './index.js'; -const __dirname = path.dirname(fileURLToPath(import.meta.url)); - -export const SNAPSHOTS_PATH = path.resolve(__dirname, '../../../', config.get('recorder.snapshots.storage.git.path')); -export const VERSIONS_PATH = path.resolve(__dirname, '../../../', config.get('recorder.versions.storage.git.path')); - const { expect } = chai; const MIME_TYPE = 'text/html'; @@ -24,25 +13,8 @@ describe('Recorder', () => { const SERVICE_ID = 'test_service'; const TYPE = 'Terms of Service'; - const adaptersTypes = { - git: { - snapshots: new GitAdapter({ - ...config.get('recorder.snapshots.storage.git'), - path: SNAPSHOTS_PATH, - }), - versions: new GitAdapter({ - ...config.get('recorder.versions.storage.git'), - path: VERSIONS_PATH, - }), - }, - mongo: { - snapshots: new MongoAdapter(config.get('recorder.versions.storage.mongo')), - versions: new MongoAdapter(config.get('recorder.snapshots.storage.mongo')), - }, - }; - - 
for (const [ adapterName, { versions: versionsAdapter, snapshots: snapshotsAdapter }] of Object.entries(adaptersTypes)) { - describe(adapterName, () => { + for (const repositoryType of [ 'git', 'mongo' ]) { + describe(repositoryType, () => { describe('#recordSnapshot', () => { const CONTENT = '

ToS fixture data with UTF-8 çhãràčtęrs

'; let recorder; @@ -51,20 +23,19 @@ describe('Recorder', () => { let record; before(async () => { - recorder = new Recorder({ - versionsStorageAdapter: versionsAdapter, - snapshotsStorageAdapter: snapshotsAdapter, - }); + const options = config.util.cloneDeep(config.recorder); + + options.versions.storage.type = repositoryType; + options.snapshots.storage.type = repositoryType; + + recorder = new Recorder(options); await recorder.initialize(); }); - after(async () => { - await snapshotsAdapter._removeAllRecords(); - await recorder.finalize(); - }); + after(async () => recorder.finalize()); context('when a required param is missing', () => { - after(async () => snapshotsAdapter._removeAllRecords()); + after(async () => recorder.snapshotsRepository.removeAll()); const validParams = { serviceId: SERVICE_ID, @@ -111,13 +82,13 @@ describe('Recorder', () => { fetchDate: FETCH_DATE, })); - record = await snapshotsAdapter.getLatestRecord(SERVICE_ID, TYPE); + record = await recorder.snapshotsRepository.findLatest(SERVICE_ID, TYPE); }); - after(async () => snapshotsAdapter._removeAllRecords()); + after(async () => recorder.snapshotsRepository.removeAll()); it('records the document with the proper content', async () => { - expect(record.content).to.equal(CONTENT); + expect(await record.content).to.equal(CONTENT); }); it('returns the record id', async () => { @@ -149,13 +120,13 @@ describe('Recorder', () => { fetchDate: FETCH_DATE_LATER, })); - record = await snapshotsAdapter.getLatestRecord(SERVICE_ID, TYPE); + record = await recorder.snapshotsRepository.findLatest(SERVICE_ID, TYPE); }); - after(async () => snapshotsAdapter._removeAllRecords()); + after(async () => recorder.snapshotsRepository.removeAll()); it('records the document with the proper content', async () => { - expect(record.content).to.equal(UPDATED_CONTENT); + expect(await record.content).to.equal(UPDATED_CONTENT); }); it('returns the record id', async () => { @@ -185,10 +156,10 @@ describe('Recorder', () => { 
fetchDate: FETCH_DATE_LATER, })); - record = await snapshotsAdapter.getLatestRecord(SERVICE_ID, TYPE); + record = await recorder.snapshotsRepository.findLatest(SERVICE_ID, TYPE); }); - after(async () => snapshotsAdapter._removeAllRecords()); + after(async () => recorder.snapshotsRepository.removeAll()); it('does not record the document', async () => { expect(id).to.not.be.ok; @@ -205,19 +176,14 @@ describe('Recorder', () => { let record; before(async () => { - recorder = new Recorder({ - versionsStorageAdapter: versionsAdapter, - snapshotsStorageAdapter: snapshotsAdapter, - }); + recorder = new Recorder(config.get('recorder')); await recorder.initialize(); }); - after(async () => { - await recorder.finalize(); - }); + after(async () => recorder.finalize()); context('when a required param is missing', () => { - after(async () => versionsAdapter._removeAllRecords()); + after(async () => recorder.versionsRepository.removeAll()); const validParams = { serviceId: SERVICE_ID, @@ -267,13 +233,13 @@ describe('Recorder', () => { fetchDate: FETCH_DATE, })); - record = await versionsAdapter.getLatestRecord(SERVICE_ID, TYPE); + record = await recorder.versionsRepository.findLatest(SERVICE_ID, TYPE); }); - after(async () => versionsAdapter._removeAllRecords()); + after(async () => recorder.versionsRepository.removeAll()); it('records the document with the proper content', async () => { - expect(record.content).to.equal(CONTENT); + expect(await record.content).to.equal(CONTENT); }); it('returns the record id', async () => { @@ -307,13 +273,13 @@ describe('Recorder', () => { fetchDate: FETCH_DATE_LATER, })); - record = await versionsAdapter.getLatestRecord(SERVICE_ID, TYPE); + record = await recorder.versionsRepository.findLatest(SERVICE_ID, TYPE); }); - after(async () => versionsAdapter._removeAllRecords()); + after(async () => recorder.versionsRepository.removeAll()); it('records the document with the proper content', async () => { - 
expect(record.content).to.equal(UPDATED_CONTENT); + expect(await record.content).to.equal(UPDATED_CONTENT); }); it('records in the document that it is not a refilter', async () => { @@ -349,10 +315,10 @@ describe('Recorder', () => { fetchDate: FETCH_DATE_LATER, })); - record = await versionsAdapter.getLatestRecord(SERVICE_ID, TYPE); + record = await recorder.versionsRepository.findLatest(SERVICE_ID, TYPE); }); - after(async () => versionsAdapter._removeAllRecords()); + after(async () => recorder.versionsRepository.removeAll()); it('does not record the document', async () => { expect(id).to.not.be.ok; @@ -369,20 +335,14 @@ describe('Recorder', () => { let record; before(async () => { - recorder = new Recorder({ - versionsStorageAdapter: versionsAdapter, - snapshotsStorageAdapter: snapshotsAdapter, - }); + recorder = new Recorder(config.get('recorder')); await recorder.initialize(); }); - after(async () => { - await versionsAdapter._removeAllRecords(); - await recorder.finalize(); - }); + after(async () => recorder.finalize()); context('when a required param is missing', () => { - after(async () => versionsAdapter._removeAllRecords()); + after(async () => recorder.versionsRepository.removeAll()); const validParams = { serviceId: SERVICE_ID, @@ -432,13 +392,13 @@ describe('Recorder', () => { fetchDate: FETCH_DATE, })); - record = await versionsAdapter.getLatestRecord(SERVICE_ID, TYPE); + record = await recorder.versionsRepository.findLatest(SERVICE_ID, TYPE); }); - after(async () => versionsAdapter._removeAllRecords()); after(async () => versionsAdapter._removeAllRecords()); + after(async () => recorder.versionsRepository.removeAll()); after(async () => recorder.versionsRepository.removeAll()); it('records the document with the proper content', async () => { - expect(record.content).to.equal(CONTENT); + expect(await record.content).to.equal(CONTENT); }); it('returns the record id', async () => { @@ -472,13 +432,13 @@ describe('Recorder', () => { fetchDate: 
FETCH_DATE_LATER, })); - record = await versionsAdapter.getLatestRecord(SERVICE_ID, TYPE); + record = await recorder.versionsRepository.findLatest(SERVICE_ID, TYPE); }); - after(async () => versionsAdapter._removeAllRecords()); + after(async () => recorder.versionsRepository.removeAll()); it('records the document with the proper content', async () => { - expect(record.content).to.equal(UPDATED_CONTENT); + expect(await record.content).to.equal(UPDATED_CONTENT); }); it('records in the document that it is a refilter', async () => { @@ -514,10 +474,10 @@ describe('Recorder', () => { fetchDate: FETCH_DATE_LATER, })); - record = await versionsAdapter.getLatestRecord(SERVICE_ID, TYPE); + record = await recorder.versionsRepository.findLatest(SERVICE_ID, TYPE); }); - after(async () => versionsAdapter._removeAllRecords()); + after(async () => recorder.versionsRepository.removeAll()); it('does not record the document', async () => { expect(id).to.not.be.ok; diff --git a/src/archivist/recorder/record.js b/src/archivist/recorder/record.js new file mode 100644 index 000000000..f21b4bd69 --- /dev/null +++ b/src/archivist/recorder/record.js @@ -0,0 +1,35 @@ +export default class Record { + #content; + + static REQUIRED_PARAMS = Object.freeze([ 'serviceId', 'documentType', 'mimeType', 'fetchDate' ]); + + constructor(params) { + Record.validate(params); + + Object.assign(this, Object.fromEntries(Object.entries(params))); + + if (params.content) { + this.#content = params.content; + } + } + + get content() { + if (this.#content === undefined) { + throw new Error('Record content not defined, set the content or use Repository#loadRecordContent'); + } + + return this.#content; + } + + set content(content) { + this.#content = content; + } + + static validate(givenParams) { + for (const param of Record.REQUIRED_PARAMS) { + if (!Object.prototype.hasOwnProperty.call(givenParams, param) || givenParams[param] == null) { + throw new Error(`"${param}" is required`); + } + } + } +} diff --git 
a/src/archivist/recorder/record.test.js b/src/archivist/recorder/record.test.js new file mode 100644 index 000000000..99a330ad3 --- /dev/null +++ b/src/archivist/recorder/record.test.js @@ -0,0 +1,91 @@ +import chai from 'chai'; +import config from 'config'; + +import Record from './record.js'; +import RepositoryFactory from './repositories/factory.js'; + +const { expect } = chai; + +describe('Record', () => { + let repository; + let subject; + const REQUIRED_PARAMS = [ 'serviceId', 'documentType', 'mimeType', 'fetchDate' ]; + const recordParams = { + serviceId: 'ServiceA', + documentType: 'Terms of Service', + mimeType: 'text/html', + fetchDate: new Date('2000-01-01T12:00:00.000Z'), + }; + + describe('Validation', () => { + describe('Required parameters', () => { + REQUIRED_PARAMS.forEach(requiredParam => { + describe(`"${requiredParam}"`, () => { + context('when it is missing', () => { + it('throws an error', async () => { + try { + const params = {}; + + Object.keys(recordParams).filter(param => param != requiredParam).forEach(param => { + params[param] = recordParams[param]; + }); + + subject = new Record({ ...params }); + } catch (e) { + expect(e).to.be.an('error'); + expect(e.message).to.have.string(`"${requiredParam}" is required`); + + return; + } + expect.fail('No error was thrown'); + }); + }); + + context('when it is null', () => { + it('throws an error', async () => { + try { + subject = new Record({ ...recordParams, [requiredParam]: null }); + } catch (e) { + expect(e).to.be.an('error'); + expect(e.message).to.have.string(`"${requiredParam}" is required`); + + return; + } + expect.fail('No error was thrown'); + }); + }); + }); + }); + }); + }); + + describe('Content access', () => { + before(async () => { + repository = await RepositoryFactory.create(config.get('recorder.versions.storage')).initialize(); + await repository.save(new Record({ + ...recordParams, + content: 'content', + })); + ([subject] = await repository.findAll()); + }); + + after(async 
() => { + await repository.removeAll(); + await repository.finalize(); + }); + + context('when it is neither defined nor loaded', () => { + it('throws an error explaining how to recover', async () => { + try { + console.log(subject.content); + } catch (e) { + expect(e).to.be.an('error'); + expect(e.message).to.have.string('set the content or use Repository#loadRecordContent'); + + return; + } + expect.fail('No error was thrown'); + }); + }); + }); +}); diff --git a/src/archivist/recorder/repositories/factory.js b/src/archivist/recorder/repositories/factory.js new file mode 100644 index 000000000..c252cd609 --- /dev/null +++ b/src/archivist/recorder/repositories/factory.js @@ -0,0 +1,23 @@ +import path from 'path'; +import { fileURLToPath } from 'url'; + +import GitRepository from './git/index.js'; +import MongoRepository from './mongo/index.js'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); + +export default class RepositoryFactory { + static create(params) { + switch (params.type) { + case 'git': + return new GitRepository({ + ...params.git, + path: path.resolve(__dirname, '../../../../', params.git.path), + }); + case 'mongo': + return new MongoRepository(params.mongo); + default: + throw new Error(`Unknown storage repository configuration for type '${params.type}'`); + } + } +} diff --git a/src/archivist/recorder/repositories/git/dataMapper.js b/src/archivist/recorder/repositories/git/dataMapper.js new file mode 100644 index 000000000..fc8e23f74 --- /dev/null +++ b/src/archivist/recorder/repositories/git/dataMapper.js @@ -0,0 +1,59 @@ +import path from 'path'; + +import mime from 'mime'; + +import Record from '../../record.js'; + +mime.define({ 'text/markdown': ['md'] }, true); // ensure extension for markdown files is `.md` and not `.markdown` + +export const COMMIT_MESSAGE_PREFIX = { + startTracking: 'Start tracking', + refilter: 'Refilter', + update: 'Update', +}; + +export const COMMIT_MESSAGE_PREFIXES_REGEXP = new 
RegExp(`^(${COMMIT_MESSAGE_PREFIX.startTracking}|${COMMIT_MESSAGE_PREFIX.refilter}|${COMMIT_MESSAGE_PREFIX.update})`); + +export function toPersistence(record, prefixMessageToSnapshotId) { + const { serviceId, documentType, isRefilter, snapshotId, mimeType, isFirstRecord } = record; + + let prefix = isRefilter ? COMMIT_MESSAGE_PREFIX.refilter : COMMIT_MESSAGE_PREFIX.update; + + prefix = isFirstRecord ? COMMIT_MESSAGE_PREFIX.startTracking : prefix; + + let message = `${prefix} ${serviceId} ${documentType}`; + + if (snapshotId) { + message = `${message}\n\n${prefixMessageToSnapshotId}${snapshotId}`; + } + + return { + message, + content: record.content, + fileExtension: mime.getExtension(mimeType), + }; +} + +export function toDomain(commit) { + const { hash, date, message, body, diff } = commit; + + const modifiedFilesInCommit = diff.files.map(({ file }) => file); + + if (modifiedFilesInCommit.length > 1) { + throw new Error(`Only one document should have been recorded in ${hash}, but all these documents were recorded: ${modifiedFilesInCommit.join(', ')}`); + } + + const [relativeFilePath] = modifiedFilesInCommit; + const snapshotIdMatch = body.match(/\b[0-9a-f]{5,40}\b/g); + + return new Record({ + id: hash, + serviceId: path.dirname(relativeFilePath), + documentType: path.basename(relativeFilePath, path.extname(relativeFilePath)), + mimeType: mime.getType(relativeFilePath), + fetchDate: new Date(date), + isFirstRecord: message.startsWith(COMMIT_MESSAGE_PREFIX.startTracking), + isRefilter: message.startsWith(COMMIT_MESSAGE_PREFIX.refilter), + snapshotId: snapshotIdMatch && snapshotIdMatch[0], + }); +} diff --git a/src/storage-adapters/git/git.js b/src/archivist/recorder/repositories/git/git.js similarity index 65% rename from src/storage-adapters/git/git.js rename to src/archivist/recorder/repositories/git/git.js index 5b2b784e8..5d7b8a60f 100644 --- a/src/storage-adapters/git/git.js +++ b/src/archivist/recorder/repositories/git/git.js @@ -1,7 +1,7 @@ import fsApi 
from 'fs'; import path from 'path'; -import simpleGit from 'simple-git'; +import simpleGit from '@opentermsarchive/simple-git'; process.env.LC_ALL = 'en_GB'; // Ensure git messages will be in English as some errors are handled by analysing the message content @@ -32,19 +32,19 @@ export default class Git { return this.git.add(this.relativePath(filepath)); } - async commit(filepath, message, authorDate) { - const options = {}; + async commit({ filepath, message, date = new Date() }) { + let summary; - if (authorDate) { - options['--date'] = new Date(authorDate).toISOString(); - } + try { + const commitDate = new Date(date).toISOString(); - let summary; + process.env.GIT_AUTHOR_DATE = commitDate; + process.env.GIT_COMMITTER_DATE = commitDate; - if (filepath) { - summary = await this.git.commit(message, this.relativePath(filepath), options); - } else { - summary = await this.git.commit(message, options); + summary = await this.git.commit(message, filepath); + } finally { + process.env.GIT_AUTHOR_DATE = ''; + process.env.GIT_COMMITTER_DATE = ''; } if (!summary.commit) { // Nothing committed, no hash to return @@ -52,15 +52,24 @@ export default class Git { } const shortHash = summary.commit.replace('HEAD ', '').replace('(root-commit) ', ''); - const longHash = (await this.git.show([ shortHash, '--pretty=%H', '-s' ])).trim(); - return longHash; // Return a long commit hash to always handle ids in the same format and facilitate comparison + return this.getFullHash(shortHash); // Return a long commit hash to always handle ids in the same format and facilitate comparison } async pushChanges() { return this.git.push(); } + async listCommits(options = []) { + return this.log([ '--reverse', '--no-merges', '--name-only', ...options ]); + } + + async getCommit(options) { + const [commit] = await this.listCommits([ '-1', ...options ]); + + return commit; + } + async log(options = {}) { try { options.file = options.file && this.relativePath(options.file); @@ -87,25 +96,6 @@ export 
default class Git { return Boolean(result); } - async findUnique(glob) { - const [latestCommit] = await this.log([ '-n', '1', '--stat=4096', glob ]); - - if (!latestCommit) { - return {}; - } - - const filePaths = latestCommit.diff.files.map(file => file.file); - - if (filePaths.length > 1) { - throw new Error(`Only one document should have been recorded in ${latestCommit.hash}, but all these documents were recorded: ${filePaths}`); - } - - return { - commit: latestCommit, - filePath: filePaths[0], - }; - } - async checkout(options) { return this.git.checkout(options); } @@ -114,6 +104,18 @@ export default class Git { return this.git.raw(options); } + async show(options) { + return this.git.show(options); + } + + async getFullHash(shortHash) { + return (await this.git.show([ shortHash, '--pretty=%H', '-s' ])).trim(); + } + + async restore(path, commit) { + return this.git.raw([ 'restore', '-s', commit, '--', path ]); + } + relativePath(absolutePath) { // Git needs a path relative to the .git directory, not an absolute one return path.relative(this.path, absolutePath); diff --git a/src/archivist/recorder/repositories/git/index.js b/src/archivist/recorder/repositories/git/index.js new file mode 100644 index 000000000..0de415b95 --- /dev/null +++ b/src/archivist/recorder/repositories/git/index.js @@ -0,0 +1,203 @@ +/** + * This module is the boundary beyond which the usage of git is abstracted. + * Commit SHAs are used as opaque unique IDs. 
+ */ + +import fsApi from 'fs'; +import path from 'path'; + +import mime from 'mime'; + +import RepositoryInterface from '../interface.js'; + +import * as DataMapper from './dataMapper.js'; +import Git from './git.js'; + +const fs = fsApi.promises; +const PDF_MIME_TYPE = 'application/pdf'; + +mime.define({ 'text/markdown': ['md'] }, true); // ensure extension for markdown files is `.md` and not `.markdown` + +export default class GitRepository extends RepositoryInterface { + constructor({ path, author, publish, prefixMessageToSnapshotId }) { + super(); + this.path = path; + this.needsPublication = publish; + this.git = new Git({ path: this.path, author }); + this.prefixMessageToSnapshotId = prefixMessageToSnapshotId; + } + + async initialize() { + await this.git.initialize(); + + return this; + } + + async save(record) { + const { serviceId, documentType, fetchDate } = record; + + if (record.isFirstRecord === undefined || record.isFirstRecord === null) { + record.isFirstRecord = await this.#isFirstRecord(serviceId, documentType); + } + const { message, content, fileExtension } = await this.#toPersistence(record); + + const filePath = await this.#writeFile({ serviceId, documentType, content, fileExtension }); + const sha = await this.#commit({ filePath, message, date: fetchDate }); + + if (!sha) { + return Object(null); + } + + record.id = sha; + + return record; + } + + finalize() { + if (!this.needsPublication) { + return; + } + + return this.git.pushChanges(); + } + + async findLatest(serviceId, documentType) { + const commit = await this.git.getCommit([`${serviceId}/${documentType}.*`]); + + return this.#toDomain(commit); + } + + async findById(recordId) { + const commit = await this.git.getCommit([recordId]); + + return this.#toDomain(commit); + } + + async findAll() { + return Promise.all((await this.#getCommits()).map(commit => this.#toDomain(commit, { deferContentLoading: true }))); + } + + async count() { + return (await this.git.log([ + 
`--grep=${DataMapper.COMMIT_MESSAGE_PREFIX.startTracking}`, + `--grep=${DataMapper.COMMIT_MESSAGE_PREFIX.refilter}`, + `--grep=${DataMapper.COMMIT_MESSAGE_PREFIX.update}`, + ])).length; + } + + async* iterate() { + const commits = await this.#getCommits(); + + for (const commit of commits) { + yield this.#toDomain(commit); + } + } + + async removeAll() { + const files = await fs.readdir(this.path, { withFileTypes: true }); + const promises = files.map(file => { + const filePath = path.join(this.path, file.name); + + if (file.isDirectory()) { + return fs.rm(filePath, { recursive: true }); + } + + return fs.unlink(filePath); + }); + + await Promise.all(promises); + + return this.initialize(); + } + + async loadRecordContent(record) { + const relativeFilePath = `${record.serviceId}/${record.documentType}.${mime.getExtension(record.mimeType)}`; + + if (record.mimeType != PDF_MIME_TYPE) { + record.content = await this.git.show(`${record.id}:${relativeFilePath}`); + + return; + } + + // In case of PDF files, `git show` cannot be used as it converts PDF binary into strings that do not retain the original binary representation + // It is impossible to restore the original binary data from the resulting string + let pdfBuffer; + + try { + await this.git.restore(relativeFilePath, record.id); // Temporarily restore the PDF file to a specific commit + pdfBuffer = await fs.readFile(`${this.path}/${relativeFilePath}`); // …read the content + } finally { + await this.git.restore(relativeFilePath, 'HEAD'); // …and finally restore the file to its most recent state + } + + record.content = pdfBuffer; + } + + async #getCommits() { + return (await this.git.listCommits()) + .filter(({ message }) => message.match(DataMapper.COMMIT_MESSAGE_PREFIXES_REGEXP)) // Skip commits which are not a document record (README, LICENSE…) + .sort((commitA, commitB) => new Date(commitA.date) - new Date(commitB.date)); // Make sure that the commits are sorted in ascending chronological order + } + + async 
#writeFile({ serviceId, documentType, content, fileExtension }) { + const directory = `${this.path}/${serviceId}`; + + if (!fsApi.existsSync(directory)) { + await fs.mkdir(directory, { recursive: true }); + } + + const filePath = this.#getPathFor(serviceId, documentType, fileExtension); + + await fs.writeFile(filePath, content); + + return filePath; + } + + async #commit({ filePath, message, date }) { + try { + await this.git.add(filePath); + + return await this.git.commit({ filePath, message, date }); + } catch (error) { + throw new Error(`Could not commit ${filePath} with message "${message}" due to error: "${error}"`); + } + } + + #getPathFor(serviceId, documentType, fileExtension) { + return `${this.path}/${serviceId}/${documentType}.${fileExtension}`; + } + + #isTracked(serviceId, documentType) { + const filePath = this.#getPathFor(serviceId, documentType, '*'); + + return this.git.isTracked(filePath); + } + + async #isFirstRecord(serviceId, documentType) { + return !await this.#isTracked(serviceId, documentType); + } + + async #toDomain(commit, { deferContentLoading } = {}) { + if (!commit) { + return Object(null); + } + + const record = DataMapper.toDomain(commit); + + if (deferContentLoading) { + return record; + } + + await this.loadRecordContent(record); + + return record; + } + + async #toPersistence(record) { + if (record.content === undefined || record.content === null) { + await this.loadRecordContent(record); + } + + return DataMapper.toPersistence(record, this.prefixMessageToSnapshotId); + } +} diff --git a/src/storage-adapters/git/index.test.js b/src/archivist/recorder/repositories/git/index.test.js similarity index 56% rename from src/storage-adapters/git/index.test.js rename to src/archivist/recorder/repositories/git/index.test.js index 3cd5c7604..2ff742f14 100644 --- a/src/storage-adapters/git/index.test.js +++ b/src/archivist/recorder/repositories/git/index.test.js @@ -6,14 +6,16 @@ import chai from 'chai'; import config from 'config'; import 
mime from 'mime'; +import Record from '../../record.js'; + import Git from './git.js'; -import GitAdapter from './index.js'; +import GitRepository from './index.js'; const { expect } = chai; const __dirname = path.dirname(fileURLToPath(import.meta.url)); -const RECORDER_PATH = path.resolve(__dirname, '../../../', config.get('recorder.snapshots.storage.git.path')); +const RECORDER_PATH = path.resolve(__dirname, '../../../', config.get('recorder.versions.storage.git.path')); const SERVICE_PROVIDER_ID = 'test_service'; const DOCUMENT_TYPE = 'Terms of Service'; @@ -21,130 +23,38 @@ const CONTENT = 'ToS fixture data with UTF-8 çhãràčtęrs'; const EXPECTED_FILE_PATH = `${RECORDER_PATH}/${SERVICE_PROVIDER_ID}/${DOCUMENT_TYPE}.html`; const EXPECTED_PDF_FILE_PATH = EXPECTED_FILE_PATH.replace('html', 'pdf'); const FETCH_DATE = new Date('2000-01-01T12:00:00.000Z'); -const SNAPSHOT_ID = 'snapshot_id'; +const FETCH_DATE_LATER = new Date('2000-01-02T12:00:00.000Z'); +const FETCH_DATE_EARLIER = new Date('2000-01-01T06:00:00.000Z'); +const SNAPSHOT_ID = '513fadb2ae415c87747047e33287805d59e2dd55'; const MIME_TYPE = 'text/html'; -const PDF_CONTENT = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'), { encoding: 'utf8' }); +const PDF_CONTENT = fs.readFileSync(path.resolve(__dirname, '../../../../../test/fixtures/terms.pdf'), { encoding: 'utf8' }); const PDF_MIME_TYPE = 'application/pdf'; let git; -describe('GitAdapter', () => { +describe('GitRepository', () => { let subject; before(async () => { git = new Git({ path: RECORDER_PATH, author: { - name: config.get('recorder.snapshots.storage.git.author.name'), - email: config.get('recorder.snapshots.storage.git.author.email'), + name: config.get('recorder.versions.storage.git.author.name'), + email: config.get('recorder.versions.storage.git.author.email'), }, }); await git.initialize(); - subject = new GitAdapter({ - ...config.get('recorder.snapshots.storage.git'), + subject = new GitRepository({ + 
...config.get('recorder.versions.storage.git'), path: RECORDER_PATH, }); return subject.initialize(); }); - describe('#_save', () => { - context('when service directory already exists', () => { - before(async () => subject._save({ - serviceId: SERVICE_PROVIDER_ID, - documentType: DOCUMENT_TYPE, - content: CONTENT, - fileExtension: 'html', - })); - - after(async () => subject._removeAllRecords()); - - it('creates a file for the given service', () => { - expect(fs.readFileSync(EXPECTED_FILE_PATH, { encoding: 'utf8' })).to.equal(CONTENT); - }); - }); - - context('when service directory does not already exist', () => { - const NEW_SERVICE_ID = 'test_not_existing_service'; - const NEW_SERVICE_EXPECTED_FILE_PATH = `${RECORDER_PATH}/${NEW_SERVICE_ID}/${DOCUMENT_TYPE}.html`; - - after(async () => subject._removeAllRecords()); - - it('creates a directory and file for the given service', async () => { - await subject._save({ - serviceId: NEW_SERVICE_ID, - documentType: DOCUMENT_TYPE, - content: CONTENT, - fileExtension: 'html', - }); - - expect(fs.readFileSync(NEW_SERVICE_EXPECTED_FILE_PATH, { encoding: 'utf8' })).to.equal(CONTENT); - }); - }); - }); - - describe('#_commit', () => { - const COMMIT_MESSAGE = 'Message to check if the commit message is properly saved'; - let id; - let commit; - - before(async () => { - await subject._save({ - serviceId: SERVICE_PROVIDER_ID, - documentType: DOCUMENT_TYPE, - content: CONTENT, - fileExtension: 'html', - }); - - id = await subject._commit(EXPECTED_FILE_PATH, COMMIT_MESSAGE); - - ([commit] = await git.log()); - }); - - after(async () => subject._removeAllRecords()); - - it('returns the id of the commit', () => { - expect(commit.hash).to.include(id); - }); - - it('properly saves the commit message', () => { - expect(commit.message).to.equal(COMMIT_MESSAGE); - }); - }); - - describe('#_getPathFor', () => { - it('returns the file path with given extension for the given service provider’s document type', () => { - 
expect(subject._getPathFor(SERVICE_PROVIDER_ID, DOCUMENT_TYPE, 'pdf')).to.equal(EXPECTED_PDF_FILE_PATH); - }); - }); - - describe('#_isTracked', () => { - after(async () => subject._removeAllRecords()); - - context('when the file does not exists', () => { - it('returns false', async () => { - expect(await subject._isTracked(SERVICE_PROVIDER_ID, DOCUMENT_TYPE)).to.be.false; - }); - }); - - context('when the file already exists', () => { - before(async () => { - await subject.record({ - serviceId: SERVICE_PROVIDER_ID, - documentType: DOCUMENT_TYPE, - content: CONTENT, - }); - }); - - it('returns true', async () => { - expect(await subject._isTracked(SERVICE_PROVIDER_ID, DOCUMENT_TYPE)).to.be.true; - }); - }); - }); - - describe('#record', () => { + describe('#save', () => { let id; let commit; let isFirstRecord; @@ -155,21 +65,21 @@ describe('GitAdapter', () => { before(async () => { numberOfRecordsBefore = (await git.log()).length; - ({ id, isFirstRecord } = await subject.record({ + ({ id, isFirstRecord } = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: CONTENT, fetchDate: FETCH_DATE, snapshotId: SNAPSHOT_ID, mimeType: MIME_TYPE, - })); + }))); numberOfRecordsAfter = (await git.log()).length; ([commit] = await git.log()); }); - after(async () => subject._removeAllRecords()); + after(async () => subject.removeAll()); it('saves the record', () => { expect(numberOfRecordsAfter).to.equal(numberOfRecordsBefore + 1); @@ -204,7 +114,7 @@ describe('GitAdapter', () => { expect(new Date(commit.date).getTime()).to.equal(FETCH_DATE.getTime()); }); - it('stores the mime type', () => { + it('stores the MIME type', () => { expect(mime.getType(EXPECTED_FILE_PATH)).to.equal(MIME_TYPE); }); @@ -218,29 +128,31 @@ describe('GitAdapter', () => { const UPDATED_CONTENT = `${CONTENT} updated`; before(async () => { - await subject.record({ + await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, 
content: CONTENT, - }); + mimeType: MIME_TYPE, + fetchDate: FETCH_DATE, + })); numberOfRecordsBefore = (await git.log()).length; - ({ id, isFirstRecord } = await subject.record({ + ({ id, isFirstRecord } = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: UPDATED_CONTENT, fetchDate: FETCH_DATE, snapshotId: SNAPSHOT_ID, mimeType: MIME_TYPE, - })); + }))); numberOfRecordsAfter = (await git.log()).length; ([commit] = await git.log()); }); - after(async () => subject._removeAllRecords()); + after(async () => subject.removeAll()); it('saves the record', () => { expect(numberOfRecordsAfter).to.equal(numberOfRecordsBefore + 1); @@ -257,24 +169,28 @@ describe('GitAdapter', () => { context('when the content has not changed', () => { before(async () => { - await subject.record({ + await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: CONTENT, - }); + mimeType: MIME_TYPE, + fetchDate: FETCH_DATE, + })); numberOfRecordsBefore = (await git.log()).length; - ({ id, isFirstRecord } = await subject.record({ + ({ id, isFirstRecord } = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: CONTENT, - })); + mimeType: MIME_TYPE, + fetchDate: FETCH_DATE, + }))); numberOfRecordsAfter = (await git.log()).length; }); - after(async () => subject._removeAllRecords()); + after(async () => subject.removeAll()); it('does not save the record', () => { expect(numberOfRecordsAfter).to.equal(numberOfRecordsBefore); @@ -289,15 +205,17 @@ describe('GitAdapter', () => { const REFILTERED_CONTENT = `${CONTENT} refiltered`; before(async () => { - await subject.record({ + await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: CONTENT, - }); // A refilter cannot be the first record + mimeType: MIME_TYPE, + fetchDate: FETCH_DATE_EARLIER, + })); // A refilter cannot be the first record numberOfRecordsBefore = (await 
git.log()).length; - ({ id, isFirstRecord } = await subject.record({ + ({ id, isFirstRecord } = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: REFILTERED_CONTENT, @@ -305,14 +223,14 @@ describe('GitAdapter', () => { isRefilter: true, snapshotId: SNAPSHOT_ID, mimeType: MIME_TYPE, - })); + }))); numberOfRecordsAfter = (await git.log()).length; ([commit] = await git.log()); }); - after(async () => subject._removeAllRecords()); + after(async () => subject.removeAll()); it('saves the record', () => { expect(numberOfRecordsAfter).to.equal(numberOfRecordsBefore + 1); @@ -331,21 +249,21 @@ describe('GitAdapter', () => { before(async () => { numberOfRecordsBefore = (await git.log()).length; - ({ id, isFirstRecord } = await subject.record({ + ({ id, isFirstRecord } = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: PDF_CONTENT, fetchDate: FETCH_DATE, snapshotId: SNAPSHOT_ID, mimeType: PDF_MIME_TYPE, - })); + }))); numberOfRecordsAfter = (await git.log()).length; ([commit] = await git.log()); }); - after(async () => subject._removeAllRecords()); + after(async () => subject.removeAll()); it('saves the record', () => { expect(numberOfRecordsAfter).to.equal(numberOfRecordsBefore + 1); @@ -359,13 +277,170 @@ describe('GitAdapter', () => { expect(fs.readFileSync(EXPECTED_PDF_FILE_PATH, { encoding: 'utf8' })).to.equal(PDF_CONTENT); }); - it('stores the mime type', () => { + it('stores the MIME type', () => { expect(mime.getType(EXPECTED_PDF_FILE_PATH)).to.equal(PDF_MIME_TYPE); }); }); }); - describe('#getLatestRecord', () => { + describe('#findById', () => { + let record; + let id; + + before(async () => { + ({ id } = await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + documentType: DOCUMENT_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + }))); + + (record = await subject.findById(id)); + }); + + 
after(async () => subject.removeAll()); + + it('returns the record id', () => { + expect(record.id).to.include(id); + }); + + it('returns a boolean to know if it is the first record', () => { + expect(record.isFirstRecord).to.be.true; + }); + + it('returns the service id', () => { + expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); + }); + + it('returns the document type', () => { + expect(record.documentType).to.equal(DOCUMENT_TYPE); + }); + + it('returns the content', async () => { + expect(record.content).to.equal(CONTENT); + }); + + it('stores the fetch date', () => { + expect(new Date(record.fetchDate).getTime()).to.equal(FETCH_DATE.getTime()); + }); + + it('stores the MIME type', () => { + expect(record.mimeType).to.equal(MIME_TYPE); + }); + + it('stores the snapshot ID', () => { + expect(record.snapshotId).to.equal(SNAPSHOT_ID); + }); + + context('when requested record does not exist', () => { + it('returns an empty object', async () => { + expect(await subject.findById('inexistantID')).to.deep.equal({}); + }); + }); + }); + + describe('#findAll', () => { + let records; + const expectedIds = []; + + before(async () => { + const { id: id1 } = await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + documentType: DOCUMENT_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + })); + + expectedIds.push(id1); + + const { id: id2 } = await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + documentType: DOCUMENT_TYPE, + content: `${CONTENT} - updated`, + fetchDate: FETCH_DATE_LATER, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + })); + + expectedIds.push(id2); + + const { id: id3 } = await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + documentType: DOCUMENT_TYPE, + content: `${CONTENT} - updated 2`, + isRefilter: true, + fetchDate: FETCH_DATE_EARLIER, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + })); + + expectedIds.push(id3); + + (records = await 
subject.findAll()); + }); + + after(async () => subject.removeAll()); + + it('returns all records', () => { + expect(records.length).to.equal(3); + }); + + it('returns Record objects', () => { + for (const record of records) { + expect(record).to.be.an.instanceof(Record); + } + }); + + it('returns records in ascending order', async () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_EARLIER, FETCH_DATE, FETCH_DATE_LATER ]); + }); + }); + + describe('#count', () => { + let count; + + before(async () => { + await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + documentType: DOCUMENT_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + })); + await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + documentType: DOCUMENT_TYPE, + content: `${CONTENT} - updated`, + fetchDate: FETCH_DATE_LATER, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + })); + await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + documentType: DOCUMENT_TYPE, + content: `${CONTENT} - updated 2`, + isRefilter: true, + fetchDate: FETCH_DATE_EARLIER, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + })); + + (count = await subject.count()); + }); + + after(async () => subject.removeAll()); + + it('returns the proper count', async () => { + expect(count).to.equal(3); + }); + }); + + describe('#findLatest', () => { context('when there are records for the given service', () => { let lastSnapshotId; let latestRecord; @@ -374,30 +449,32 @@ describe('GitAdapter', () => { const UPDATED_FILE_CONTENT = `${CONTENT} (with additional content to trigger a record)`; before(async () => { - await subject.record({ + await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: CONTENT, mimeType: MIME_TYPE, - }); + fetchDate: FETCH_DATE_EARLIER, + })); - ({ id: lastSnapshotId } = await subject.record({ + ({ id: lastSnapshotId } = await subject.save(new 
Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: UPDATED_FILE_CONTENT, mimeType: MIME_TYPE, - })); + fetchDate: FETCH_DATE, + }))); - latestRecord = await subject.getLatestRecord(SERVICE_PROVIDER_ID, DOCUMENT_TYPE); + latestRecord = await subject.findLatest(SERVICE_PROVIDER_ID, DOCUMENT_TYPE); }); - after(async () => subject._removeAllRecords()); + after(async () => subject.removeAll()); it('returns the latest record id', () => { expect(latestRecord.id).to.include(lastSnapshotId); }); - it('returns the latest record content', () => { + it('returns the latest record content', async () => { expect(latestRecord.content.toString('utf8')).to.equal(UPDATED_FILE_CONTENT); }); @@ -408,23 +485,24 @@ describe('GitAdapter', () => { context('with PDF document', () => { before(async () => { - ({ id: lastSnapshotId } = await subject.record({ + ({ id: lastSnapshotId } = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: PDF_CONTENT, mimeType: PDF_MIME_TYPE, - })); + fetchDate: FETCH_DATE, + }))); - latestRecord = await subject.getLatestRecord(SERVICE_PROVIDER_ID, DOCUMENT_TYPE); + latestRecord = await subject.findLatest(SERVICE_PROVIDER_ID, DOCUMENT_TYPE); }); - after(async () => subject._removeAllRecords()); + after(async () => subject.removeAll()); it('returns the latest record id', () => { expect(latestRecord.id).to.include(lastSnapshotId); }); - it('returns the latest record content', () => { + it('returns the latest record content', async () => { expect(latestRecord.content.toString('utf8')).to.equal(PDF_CONTENT); }); @@ -438,64 +516,69 @@ describe('GitAdapter', () => { let latestRecord; before(async () => { - latestRecord = await subject.getLatestRecord(SERVICE_PROVIDER_ID, DOCUMENT_TYPE); - }); - - it('returns no id', () => { - expect(latestRecord.id).to.not.be.ok; + latestRecord = await subject.findLatest(SERVICE_PROVIDER_ID, DOCUMENT_TYPE); }); - it('returns no content', () => { - 
expect(latestRecord.content).to.not.be.ok; - }); - - it('returns no mime type', () => { - expect(latestRecord.mimeType).to.not.be.ok; + it('returns an empty object', async () => { + expect(latestRecord).to.deep.equal({}); }); }); }); describe('#iterate', () => { + const expectedIds = []; + const ids = []; + const fetchDates = []; + before(async () => { - await subject.record({ + const { id: id1 } = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: CONTENT, fetchDate: FETCH_DATE, snapshotId: SNAPSHOT_ID, mimeType: MIME_TYPE, - }); + })); + + expectedIds.push(id1); - await subject.record({ + const { id: id2 } = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: `${CONTENT} - updated`, - fetchDate: FETCH_DATE, + fetchDate: FETCH_DATE_LATER, snapshotId: SNAPSHOT_ID, mimeType: MIME_TYPE, - }); + })); - await subject.record({ + expectedIds.push(id2); + + const { id: id3 } = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: `${CONTENT} - updated 2`, isRefilter: true, - fetchDate: FETCH_DATE, + fetchDate: FETCH_DATE_EARLIER, snapshotId: SNAPSHOT_ID, mimeType: MIME_TYPE, - }); - }); - - after(async () => subject._removeAllRecords()); + })); - it('iterates through all records', async () => { - const result = []; + expectedIds.push(id3); for await (const record of subject.iterate()) { - result.push(record.content); + ids.push(record.id); + fetchDates.push(record.fetchDate); } + }); + + after(async () => subject.removeAll()); + + it('iterates through all records', async () => { + expect(ids).to.have.members(expectedIds); + }); - expect(result).to.deep.equal([ `${CONTENT} - updated 2`, `${CONTENT} - updated`, CONTENT ]); + it('iterates in ascending order', async () => { + expect(fetchDates).to.deep.equal([ FETCH_DATE_EARLIER, FETCH_DATE, FETCH_DATE_LATER ]); }); }); }); diff --git 
a/src/archivist/recorder/repositories/interface.js b/src/archivist/recorder/repositories/interface.js new file mode 100644 index 000000000..a2c54354a --- /dev/null +++ b/src/archivist/recorder/repositories/interface.js @@ -0,0 +1,106 @@ +/** + * Interface for classes that model a collection of domain objects with querying capabilities + * @see {@link https://martinfowler.com/eaaCatalog/repository.html|Repository} + * @interface + */ +export default class RepositoryInterface { + /** + * [Optional] Initialize repository + * Override this method if the repository needs some asynchronous initialization code (open database connection and create collections, initialize Git…) + * + * @returns {Promise} Promise that will be resolved with the current repository instance + */ + async initialize() { + return this; + } + + /** + * [Optional] Finalize repository + * Override this method if the repository needs some asynchronous code to properly close the repository (close database connection, push changes on Git remote…) + * + * @returns {Promise} Promise that will be resolved with the current repository instance + */ + async finalize() { + return this; + } + + /** + * Persist the given record if it does not already exist in repository + * + * @param {Record} record - Record to persist + * @returns {Promise} Promise that will be resolved with the given record when it has been persisted + */ + async save(record) { + throw new Error(`#save method is not implemented in ${this.constructor.name}`); + } + + /** + * Find the most recent record that matches the given service ID and document type + * + * @param {string} serviceId - Service ID of record to find + * @param {string} documentType - Document type of record to find + * @returns {Promise} Promise that will be resolved with the found record or an empty object if none match the given criteria + */ + async findLatest(serviceId, documentType) { + throw new Error(`#findLatest method is not implemented in ${this.constructor.name}`); 
+ } + + /** + * Find the record that matches the given record ID + * + * @param {string} recordId - Record ID of the record to find + * @returns {Promise} Promise that will be resolved with the found record or an empty object if none match the given ID + */ + async findById(recordId) { + throw new Error(`#findById method is not implemented in ${this.constructor.name}`); + } + + /** + * Find all records + * For performance reasons, the content of the records will not be loaded by default. Use #loadRecordContent to load the content of individual records + * + * @see RepositoryInterface#loadRecordContent + * @returns {Promise>} Promise that will be resolved with an array of all records + */ + async findAll() { + throw new Error(`#findAll method is not implemented in ${this.constructor.name}`); + } + + /** + * Count the total number of records in the repository + * For performance reasons, use this method rather than counting the number of entries returned by #findAll if you only need the size of a repository + * + * @returns {Promise} Promise that will be resolved with the total number of records + */ + async count() { + throw new Error(`#count method is not implemented in ${this.constructor.name}`); + } + + /** + * Iterate over all records in the repository, from oldest to most recent + * + * @yields {Record} + */ + async* iterate() { + throw new Error(`#iterate method is not implemented in ${this.constructor.name}`); + } + + /** + * Remove all records + * + * @returns {Promise} Promise that will be resolved when all records are removed + */ + async removeAll() { + throw new Error(`#removeAll method is not implemented in ${this.constructor.name}`); + } + + /** + * Load content of the given record + * + * @param {Record} record - Record of which to populate content + * @returns {Promise} Promise that will be resolved with the given record when its content has been loaded + */ + async loadRecordContent(record) { + throw new Error(`#loadRecordContent method is not 
implemented in ${this.constructor.name}`); + } +} diff --git a/src/archivist/recorder/repositories/mongo/dataMapper.js b/src/archivist/recorder/repositories/mongo/dataMapper.js new file mode 100644 index 000000000..ca05c2a4b --- /dev/null +++ b/src/archivist/recorder/repositories/mongo/dataMapper.js @@ -0,0 +1,31 @@ +import { ObjectId } from 'mongodb'; + +import Record from '../../record.js'; + +export function toPersistence(record) { + const documentFields = Object.fromEntries(Object.entries(record)); + + if (documentFields.snapshotId) { + documentFields.snapshotId = new ObjectId(record.snapshotId); + } + + documentFields.content = record.content; + documentFields.created_at = new Date(); + + return documentFields; +} + +export function toDomain(document) { + const { _id, serviceId, documentType, fetchDate, mimeType, isRefilter, isFirstRecord, snapshotId } = document; + + return new Record({ + id: _id.toString(), + serviceId, + documentType, + mimeType, + fetchDate: new Date(fetchDate), + isFirstRecord: Boolean(isFirstRecord), + isRefilter: Boolean(isRefilter), + snapshotId: snapshotId && snapshotId.toString(), + }); +} diff --git a/src/archivist/recorder/repositories/mongo/index.js b/src/archivist/recorder/repositories/mongo/index.js new file mode 100644 index 000000000..cec3a1b9c --- /dev/null +++ b/src/archivist/recorder/repositories/mongo/index.js @@ -0,0 +1,123 @@ +/** + * This module is the boundary beyond which the usage of MongoDB is abstracted. + * Object IDs are used as opaque unique IDs. 
+ */ + +import { MongoClient, ObjectId, Binary } from 'mongodb'; + +import RepositoryInterface from '../interface.js'; + +import * as DataMapper from './dataMapper.js'; + +export default class MongoRepository extends RepositoryInterface { + constructor({ database: databaseName, collection: collectionName, connectionURI }) { + super(); + const client = new MongoClient(connectionURI); + + this.databaseName = databaseName; + this.collectionName = collectionName; + this.client = client; + } + + async initialize() { + await this.client.connect(); + const db = this.client.db(this.databaseName); + + this.collection = db.collection(this.collectionName); + + return this; + } + + async finalize() { + return this.client.close(); + } + + async save(record) { + const { serviceId, documentType } = record; + + if (record.isFirstRecord === undefined || record.isFirstRecord === null) { + record.isFirstRecord = !await this.collection.findOne({ serviceId, documentType }); + } + + const documentFields = await this.#toPersistence(record); + + const { content: previousRecordContent } = await this.findLatest(serviceId, documentType); + + if (previousRecordContent == documentFields.content) { + return Object(null); + } + + const insertResult = await this.collection.insertOne(documentFields); + + record.id = insertResult.insertedId.toString(); + + return record; + } + + async findLatest(serviceId, documentType) { + const [mongoDocument] = await this.collection.find({ serviceId, documentType }).limit(1).sort({ fetchDate: -1 }).toArray(); // `findOne` doesn't support the `sort` method, so even for only one document use `find` + + return this.#toDomain(mongoDocument); + } + + async findById(recordId) { + const mongoDocument = await this.collection.findOne({ _id: new ObjectId(recordId) }); + + return this.#toDomain(mongoDocument); + } + + async findAll() { + return Promise.all((await this.collection.find().project({ content: 0 }).sort({ fetchDate: 1 }).toArray()) + .map(mongoDocument => 
this.#toDomain(mongoDocument, { deferContentLoading: true }))); + } + + async count() { + return this.collection.find().count(); + } + + async* iterate() { + const cursor = this.collection.find().sort({ fetchDate: 1 }); + + /* eslint-disable no-await-in-loop */ + while (await cursor.hasNext()) { + const mongoDocument = await cursor.next(); + + yield this.#toDomain(mongoDocument); + } + /* eslint-enable no-await-in-loop */ + } + + async removeAll() { + return this.collection.deleteMany(); + } + + async loadRecordContent(record) { + const { content } = await this.collection.findOne({ _id: new ObjectId(record.id) }, { projection: { content: 1 } }); + + record.content = content instanceof Binary ? content.buffer : content; + } + + async #toDomain(document, { deferContentLoading } = {}) { + if (!document) { + return Object(null); + } + + const record = DataMapper.toDomain(document); + + if (deferContentLoading) { + return record; + } + + await this.loadRecordContent(record); + + return record; + } + + async #toPersistence(record) { + if (record.content === undefined || record.content === null) { + await this.repository.loadRecordContent(record); + } + + return DataMapper.toPersistence(record); + } +} diff --git a/src/storage-adapters/mongo/index.test.js b/src/archivist/recorder/repositories/mongo/index.test.js similarity index 55% rename from src/storage-adapters/mongo/index.test.js rename to src/archivist/recorder/repositories/mongo/index.test.js index c3ac45d72..1a703b710 100644 --- a/src/storage-adapters/mongo/index.test.js +++ b/src/archivist/recorder/repositories/mongo/index.test.js @@ -6,7 +6,9 @@ import chai from 'chai'; import config from 'config'; import { MongoClient } from 'mongodb'; -import MongoAdapter from './index.js'; +import Record from '../../record.js'; + +import MongoRepository from './index.js'; const { expect } = chai; const __dirname = path.dirname(fileURLToPath(import.meta.url)); @@ -20,18 +22,19 @@ const CONTENT = 'ToS fixture data with UTF-8 
çhãràčtęrs'; const MIME_TYPE = 'text/html'; const FETCH_DATE = new Date('2000-01-01T12:00:00.000Z'); const FETCH_DATE_LATER = new Date('2000-01-02T12:00:00.000Z'); +const FETCH_DATE_EARLIER = new Date('2000-01-01T06:00:00.000Z'); const SNAPSHOT_ID = '61af86dc5ff5caa74ae926ad'; -const PDF_CONTENT = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/terms.pdf'), { encoding: 'utf8' }); -const UPDATED_PDF_CONTENT = fs.readFileSync(path.resolve(__dirname, '../../../test/fixtures/termsModified.pdf'), { encoding: 'utf8' }); +const PDF_CONTENT = fs.readFileSync(path.resolve(__dirname, '../../../../../test/fixtures/terms.pdf')); +const UPDATED_PDF_CONTENT = fs.readFileSync(path.resolve(__dirname, '../../../../../test/fixtures/termsModified.pdf')); const PDF_MIME_TYPE = 'application/pdf'; let collection; -describe('MongoAdapter', () => { +describe('MongoRepository', () => { let subject; before(async () => { - subject = new MongoAdapter(config.get('recorder.snapshots.storage.mongo')); + subject = new MongoRepository(config.get('recorder.snapshots.storage.mongo')); await subject.initialize(); await client.connect(); const db = client.db(config.get('recorder.snapshots.storage.mongo.database')); @@ -39,7 +42,7 @@ describe('MongoAdapter', () => { collection = db.collection(config.get('recorder.snapshots.storage.mongo.collection')); }); - describe('#record', () => { + describe('#save', () => { let record; let mongoDocument; let numberOfRecordsBefore; @@ -52,14 +55,14 @@ describe('MongoAdapter', () => { documentType: DOCUMENT_TYPE, }).count(); - (record = await subject.record({ + (record = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: CONTENT, mimeType: MIME_TYPE, fetchDate: FETCH_DATE, snapshotId: SNAPSHOT_ID, - })); + }))); numberOfRecordsAfter = await collection.find({ serviceId: SERVICE_PROVIDER_ID, @@ -72,7 +75,7 @@ describe('MongoAdapter', () => { })); }); - after(async () => subject._removeAllRecords()); + 
after(async () => subject.removeAll()); it('saves the record', () => { expect(numberOfRecordsAfter).to.equal(numberOfRecordsBefore + 1); @@ -107,7 +110,7 @@ describe('MongoAdapter', () => { expect(new Date(mongoDocument.fetchDate).getTime()).to.equal(FETCH_DATE.getTime()); }); - it('stores the mime type', () => { + it('stores the MIME type', () => { expect(mongoDocument.mimeType).to.equal(MIME_TYPE); }); @@ -121,28 +124,28 @@ describe('MongoAdapter', () => { const UPDATED_CONTENT = `${CONTENT} updated`; before(async () => { - (record = await subject.record({ + (record = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: CONTENT, mimeType: MIME_TYPE, fetchDate: FETCH_DATE, snapshotId: SNAPSHOT_ID, - })); + }))); numberOfRecordsBefore = await collection.find({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, }).count(); - (record = await subject.record({ + (record = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: UPDATED_CONTENT, mimeType: MIME_TYPE, fetchDate: FETCH_DATE, snapshotId: SNAPSHOT_ID, - })); + }))); numberOfRecordsAfter = await collection.find({ serviceId: SERVICE_PROVIDER_ID, @@ -155,7 +158,7 @@ describe('MongoAdapter', () => { }).limit(1).sort({ created_at: -1 }).toArray()); }); - after(async () => subject._removeAllRecords()); + after(async () => subject.removeAll()); it('saves the record', () => { expect(numberOfRecordsAfter).to.equal(numberOfRecordsBefore + 1); @@ -172,26 +175,26 @@ describe('MongoAdapter', () => { context('when the content has not changed', () => { before(async () => { - await subject.record({ + await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: CONTENT, mimeType: MIME_TYPE, fetchDate: FETCH_DATE, - }); + })); numberOfRecordsBefore = await collection.find({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, }).count(); - (record = await subject.record({ + 
(record = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: CONTENT, mimeType: MIME_TYPE, fetchDate: FETCH_DATE_LATER, - })); + }))); numberOfRecordsAfter = await collection.find({ serviceId: SERVICE_PROVIDER_ID, @@ -199,7 +202,7 @@ describe('MongoAdapter', () => { }).count(); }); - after(async () => subject._removeAllRecords()); + after(async () => subject.removeAll()); it('does not save the record', () => { expect(numberOfRecordsAfter).to.equal(numberOfRecordsBefore); @@ -214,18 +217,20 @@ describe('MongoAdapter', () => { const REFILTERED_CONTENT = `${CONTENT} refiltered`; before(async () => { - await subject.record({ + await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: CONTENT, - }); // A refilter cannot be the first record + mimeType: MIME_TYPE, + fetchDate: FETCH_DATE_EARLIER, + })); // A refilter cannot be the first record numberOfRecordsBefore = await collection.find({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, }).count(); - (record = await subject.record({ + (record = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: REFILTERED_CONTENT, @@ -233,7 +238,7 @@ describe('MongoAdapter', () => { fetchDate: FETCH_DATE, snapshotId: SNAPSHOT_ID, isRefilter: true, - })); + }))); numberOfRecordsAfter = await collection.find({ serviceId: SERVICE_PROVIDER_ID, @@ -246,7 +251,7 @@ describe('MongoAdapter', () => { }).limit(1).sort({ created_at: -1 }).toArray()); }); - after(async () => subject._removeAllRecords()); + after(async () => subject.removeAll()); it('saves the record', () => { expect(numberOfRecordsAfter).to.equal(numberOfRecordsBefore + 1); @@ -270,14 +275,14 @@ describe('MongoAdapter', () => { mimeType: PDF_MIME_TYPE, }).count(); - (record = await subject.record({ + (record = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: PDF_CONTENT, 
mimeType: PDF_MIME_TYPE, fetchDate: FETCH_DATE, snapshotId: SNAPSHOT_ID, - })); + }))); numberOfRecordsAfter = await collection.find({ serviceId: SERVICE_PROVIDER_ID, @@ -290,7 +295,7 @@ describe('MongoAdapter', () => { })); }); - after(async () => subject._removeAllRecords()); + after(async () => subject.removeAll()); it('saves the record', () => { expect(numberOfRecordsAfter).to.equal(numberOfRecordsBefore + 1); @@ -300,17 +305,176 @@ describe('MongoAdapter', () => { expect(mongoDocument._id.toString()).to.equal(record.id); }); - it('stores the proper content', () => { - expect(mongoDocument.content).to.equal(PDF_CONTENT); + it('stores the proper content', async () => { + const isSameContent = Buffer.compare(mongoDocument.content.buffer, PDF_CONTENT) == 0; + + expect(isSameContent).to.be.true; }); - it('stores the mime type', () => { + it('stores the MIME type', () => { expect(mongoDocument.mimeType).to.equal(PDF_MIME_TYPE); }); }); }); - describe('#getLatestRecord', () => { + describe('#findById', () => { + let record; + let id; + + before(async () => { + ({ id } = await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + documentType: DOCUMENT_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + }))); + + (record = await subject.findById(id)); + }); + + after(async () => subject.removeAll()); + + it('returns the record id', () => { + expect(record.id).to.include(id); + }); + + it('returns a boolean to know if it is the first record', () => { + expect(record.isFirstRecord).to.be.true; + }); + + it('returns the service id', () => { + expect(record.serviceId).to.equal(SERVICE_PROVIDER_ID); + }); + + it('returns the document type', () => { + expect(record.documentType).to.equal(DOCUMENT_TYPE); + }); + + it('returns the content', async () => { + expect(record.content).to.equal(CONTENT); + }); + + it('stores the fetch date', () => { + expect(new 
Date(record.fetchDate).getTime()).to.equal(FETCH_DATE.getTime()); + }); + + it('stores the MIME type', () => { + expect(record.mimeType).to.equal(MIME_TYPE); + }); + + it('stores the snapshot ID', () => { + expect(record.snapshotId).to.equal(SNAPSHOT_ID); + }); + + context('when requested record does not exist', () => { + it('returns an empty object', async () => { + expect(await subject.findById('inexistantID')).to.deep.equal({}); + }); + }); + }); + + describe('#findAll', () => { + let records; + const expectedIds = []; + + before(async () => { + const { id: id1 } = await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + documentType: DOCUMENT_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + })); + + expectedIds.push(id1); + + const { id: id2 } = await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + documentType: DOCUMENT_TYPE, + content: `${CONTENT} - updated`, + fetchDate: FETCH_DATE_LATER, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + })); + + expectedIds.push(id2); + + const { id: id3 } = await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + documentType: DOCUMENT_TYPE, + content: `${CONTENT} - updated 2`, + isRefilter: true, + fetchDate: FETCH_DATE_EARLIER, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + })); + + expectedIds.push(id3); + + (records = await subject.findAll()); + }); + + after(async () => subject.removeAll()); + + it('returns all records', () => { + expect(records.length).to.equal(3); + }); + + it('returns Record objects', () => { + for (const record of records) { + expect(record).to.be.an.instanceof(Record); + } + }); + + it('returns records in ascending order', async () => { + expect(records.map(record => record.fetchDate)).to.deep.equal([ FETCH_DATE_EARLIER, FETCH_DATE, FETCH_DATE_LATER ]); + }); + }); + + describe('#count', () => { + let count; + + before(async () => { + await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + 
documentType: DOCUMENT_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + })); + await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + documentType: DOCUMENT_TYPE, + content: `${CONTENT} - updated`, + fetchDate: FETCH_DATE_LATER, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + })); + await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + documentType: DOCUMENT_TYPE, + content: `${CONTENT} - updated 2`, + isRefilter: true, + fetchDate: FETCH_DATE_EARLIER, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + })); + + (count = await subject.count()); + }); + + after(async () => subject.removeAll()); + + it('returns the proper count', async () => { + expect(count).to.equal(3); + }); + }); + + describe('#findLatest', () => { context('when there are records for the given service', () => { let lastSnapshotId; let latestRecord; @@ -319,35 +483,36 @@ describe('MongoAdapter', () => { const UPDATED_CONTENT = `${CONTENT} (with additional content to trigger a record)`; before(async () => { - await subject.record({ + await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: CONTENT, fetchDate: FETCH_DATE, - }); + mimeType: MIME_TYPE, + })); - ({ id: lastSnapshotId } = await subject.record({ + ({ id: lastSnapshotId } = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: UPDATED_CONTENT, mimeType: MIME_TYPE, fetchDate: FETCH_DATE_LATER, - })); + }))); - latestRecord = await subject.getLatestRecord( + latestRecord = await subject.findLatest( SERVICE_PROVIDER_ID, DOCUMENT_TYPE, ); }); - after(async () => subject._removeAllRecords()); + after(async () => subject.removeAll()); it('returns the latest record id', () => { expect(latestRecord.id).to.include(lastSnapshotId); }); - it('returns the latest record content', () => { - expect(latestRecord.content.toString('utf8')).to.equal(UPDATED_CONTENT); + it('returns 
the latest record content', async () => { + expect((await latestRecord.content).toString('utf8')).to.equal(UPDATED_CONTENT); }); it('returns the latest record mime type', () => { @@ -357,33 +522,35 @@ describe('MongoAdapter', () => { context('with PDF document', () => { before(async () => { - await subject.record({ + await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: PDF_CONTENT, mimeType: PDF_MIME_TYPE, fetchDate: FETCH_DATE, - }); + })); - ({ id: lastSnapshotId } = await subject.record({ + ({ id: lastSnapshotId } = await subject.save(new Record({ serviceId: SERVICE_PROVIDER_ID, documentType: DOCUMENT_TYPE, content: UPDATED_PDF_CONTENT, mimeType: PDF_MIME_TYPE, fetchDate: FETCH_DATE_LATER, - })); + }))); - latestRecord = await subject.getLatestRecord(SERVICE_PROVIDER_ID, DOCUMENT_TYPE); + latestRecord = await subject.findLatest(SERVICE_PROVIDER_ID, DOCUMENT_TYPE); }); - after(async () => subject._removeAllRecords()); + after(async () => subject.removeAll()); it('returns the latest record id', () => { expect(latestRecord.id).to.include(lastSnapshotId); }); - it('returns the latest record content', () => { - expect(latestRecord.content).to.equal(UPDATED_PDF_CONTENT); + it('returns the latest record content', async () => { + const isSameContent = Buffer.compare(latestRecord.content, UPDATED_PDF_CONTENT) == 0; + + expect(isSameContent).to.be.true; }); it('returns the latest record mime type', () => { @@ -396,20 +563,69 @@ describe('MongoAdapter', () => { let latestRecord; before(async () => { - latestRecord = await subject.getLatestRecord(SERVICE_PROVIDER_ID, DOCUMENT_TYPE); + latestRecord = await subject.findLatest(SERVICE_PROVIDER_ID, DOCUMENT_TYPE); }); - it('returns no id', () => { - expect(latestRecord.id).to.not.be.ok; + it('returns an empty object', async () => { + expect(latestRecord).to.deep.equal({}); }); + }); + }); - it('returns no content', () => { - expect(latestRecord.content).to.not.be.ok; - }); + 
describe('#iterate', () => { + const expectedIds = []; + const ids = []; + const fetchDates = []; + + before(async () => { + const { id: id1 } = await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + documentType: DOCUMENT_TYPE, + content: CONTENT, + fetchDate: FETCH_DATE, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + })); + + expectedIds.push(id1); + + const { id: id2 } = await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + documentType: DOCUMENT_TYPE, + content: `${CONTENT} - updated`, + fetchDate: FETCH_DATE_LATER, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + })); + + expectedIds.push(id2); + + const { id: id3 } = await subject.save(new Record({ + serviceId: SERVICE_PROVIDER_ID, + documentType: DOCUMENT_TYPE, + content: `${CONTENT} - updated 2`, + isRefilter: true, + fetchDate: FETCH_DATE_EARLIER, + snapshotId: SNAPSHOT_ID, + mimeType: MIME_TYPE, + })); + + expectedIds.push(id3); + + for await (const record of subject.iterate()) { + ids.push(record.id); + fetchDates.push(record.fetchDate); + } + }); - it('returns no mime type', () => { - expect(latestRecord.mimeType).to.not.be.ok; - }); + after(async () => subject.removeAll()); + + it('iterates through all records', async () => { + expect(ids).to.have.members(expectedIds); + }); + + it('iterates in ascending order', async () => { + expect(fetchDates).to.deep.equal([ FETCH_DATE_EARLIER, FETCH_DATE, FETCH_DATE_LATER ]); }); }); }); diff --git a/src/index.js b/src/index.js deleted file mode 100644 index 6a4cce8aa..000000000 --- a/src/index.js +++ /dev/null @@ -1,37 +0,0 @@ -import path from 'path'; -import { fileURLToPath } from 'url'; - -import config from 'config'; - -import GitAdapter from './storage-adapters/git/index.js'; -import MongoAdapter from './storage-adapters/mongo/index.js'; - -const __dirname = path.dirname(fileURLToPath(import.meta.url)); - -export function instantiateVersionsStorageAdapter() { - return instantiateStorageAdapter('versions'); -} - -export 
function instantiateSnapshotsStorageAdapter() { - return instantiateStorageAdapter('snapshots'); -} - -function instantiateStorageAdapter(recordType) { - let result; - - switch (config.get(`recorder.${recordType}.storage.type`)) { - case 'git': - result = new GitAdapter({ - ...config.get(`recorder.${recordType}.storage.git`), - path: path.resolve(__dirname, '../', config.get(`recorder.${recordType}.storage.git.path`)), - }); - break; - case 'mongo': - result = new MongoAdapter(config.get(`recorder.${recordType}.storage.mongo`)); - break; - default: - throw new Error(`No configuration found for ${recordType} storage adapter`); - } - - return result; -} diff --git a/src/main.js b/src/main.js index c5bbb5a9f..42196fca5 100644 --- a/src/main.js +++ b/src/main.js @@ -1,3 +1,4 @@ +import config from 'config'; import cron from 'croner'; import Archivist from './archivist/index.js'; @@ -5,20 +6,13 @@ import logger from './logger/index.js'; import Notifier from './notifier/index.js'; import Tracker from './tracker/index.js'; -import { instantiateVersionsStorageAdapter, instantiateSnapshotsStorageAdapter } from './index.js'; - const args = process.argv.slice(2); const refilterOnly = args.includes('--refilter-only'); const schedule = args.includes('--schedule'); const extraArgs = args.filter(arg => !arg.startsWith('--')); (async function startOpenTermsArchive() { - const archivist = new Archivist({ - storage: { - versions: instantiateVersionsStorageAdapter(), - snapshots: instantiateSnapshotsStorageAdapter(), - }, - }); + const archivist = new Archivist({ recorderConfig: config.get('recorder') }); archivist.attach(logger); diff --git a/src/storage-adapters/git/index.js b/src/storage-adapters/git/index.js deleted file mode 100644 index d7b11e588..000000000 --- a/src/storage-adapters/git/index.js +++ /dev/null @@ -1,190 +0,0 @@ -/** - * This file is the boundary beyond which the usage of git is abstracted. - * Commit SHAs are used as opaque unique IDs. 
- */ - -import fsApi from 'fs'; -import path from 'path'; - -import mime from 'mime'; - -import Git from './git.js'; - -const fs = fsApi.promises; - -mime.define({ 'text/markdown': ['md'] }, true); // ensure extension for markdown files is `.md` and not `.markdown` - -export default class GitAdapter { - constructor({ path, author, publish, prefixMessageToSnapshotId }) { - this.path = path; - this.author = author; - this.needsPublication = publish; - this.prefixMessageToSnapshotId = prefixMessageToSnapshotId; - } - - async initialize() { - this.git = new Git({ path: this.path, author: this.author }); - - await this.git.initialize(); - - return this; - } - - async record({ serviceId, documentType, content, mimeType, fetchDate, isRefilter, snapshotId }) { - const isFirstRecord = await this._isFirstRecord(serviceId, documentType); - const message = this._generateCommitMessage({ serviceId, documentType, isRefilter, snapshotId, isFirstRecord }); - const fileExtension = mime.getExtension(mimeType); - const filePath = await this._save({ serviceId, documentType, content, fileExtension }); - const sha = await this._commit(filePath, message, fetchDate); - - if (!sha) { - return {}; - } - - return { - id: sha, - isFirstRecord, - }; - } - - finalize() { - if (!this.needsPublication) { - return; - } - - return this.git.pushChanges(); - } - - async getLatestRecord(serviceId, documentType) { - const filePathGlob = this._getPathFor(serviceId, documentType, '*'); - const { commit, filePath } = await this.git.findUnique(filePathGlob); - const recordFilePath = `${this.path}/${filePath}`; - - if (!commit || !filePath || !fsApi.existsSync(recordFilePath)) { - return {}; - } - - const mimeType = mime.getType(filePath); - const readFileOptions = {}; - - if (mimeType.startsWith('text/')) { - readFileOptions.encoding = 'utf8'; - } - - return { - id: commit.hash, - content: await fs.readFile(recordFilePath, readFileOptions), - mimeType, - fetchDate: new Date(commit.date), - isRefilter: 
commit.message.startsWith('Refilter'), - }; - } - - async* iterate() { - const initialCommitHash = (await this.git.raw([ 'rev-list', '--max-parents=0', 'HEAD' ])).trim(); - const currentBranchName = (await this.git.raw([ 'rev-parse', '--abbrev-ref', 'HEAD' ])).trim(); - - try { - let previousCommitHash; - - /* eslint-disable no-await-in-loop */ - while (previousCommitHash != initialCommitHash) { - const [{ hash, date, message, diff: { files: [{ file: relativeFilePath }] } }] = await this.git.log([ '-1', '--stat=4096', '--no-merges' ]); // get current commit information - - if (message.match(/^(Start tracking|Update|Refilter)/)) { // Skip commits which are not a document versions (initial README or LICENSE commits for example) - const absoluteFilePath = `${this.path}/${relativeFilePath}`; - - const serviceId = path.dirname(relativeFilePath); - const extension = path.extname(relativeFilePath); - const documentType = path.basename(relativeFilePath, extension); - - yield { - id: hash, - serviceId, - documentType, - content: await fs.readFile(absoluteFilePath, { encoding: 'utf8' }), - fetchDate: new Date(date), - }; - } - - previousCommitHash = hash; - - if (initialCommitHash != hash) { - await this.git.checkout(['HEAD^']); // checkout the parent commit - } - } - /* eslint-enable no-await-in-loop */ - } finally { - await this.git.checkout([currentBranchName]); - } - } - - async _save({ serviceId, documentType, content, fileExtension }) { - const directory = `${this.path}/${serviceId}`; - - if (!fsApi.existsSync(directory)) { - await fs.mkdir(directory, { recursive: true }); - } - - const filePath = this._getPathFor(serviceId, documentType, fileExtension); - - await fs.writeFile(filePath, content); - - return filePath; - } - - async _commit(filePath, message, authorDate) { - try { - await this.git.add(filePath); - - return await this.git.commit(filePath, message, authorDate); - } catch (error) { - throw new Error(`Could not commit ${filePath} with message "${message}" 
due to error: "${error}"`); - } - } - - _getPathFor(serviceId, documentType, fileExtension) { - return `${this.path}/${serviceId}/${documentType}.${fileExtension}`; - } - - _isTracked(serviceId, documentType) { - const filePath = this._getPathFor(serviceId, documentType, '*'); - - return this.git.isTracked(filePath); - } - - async _isFirstRecord(serviceId, documentType) { - return !await this._isTracked(serviceId, documentType); - } - - _generateCommitMessage({ serviceId, documentType, isRefilter, snapshotId, isFirstRecord }) { - let prefix = isRefilter ? 'Refilter' : 'Update'; - - prefix = isFirstRecord ? 'Start tracking' : prefix; - - let message = `${prefix} ${serviceId} ${documentType}`; - - if (snapshotId) { - message = `${message}\n\n${this.prefixMessageToSnapshotId}${snapshotId}`; - } - - return message; - } - - async _removeAllRecords() { - const files = await fs.readdir(this.path, { withFileTypes: true }); - const promises = files.map(file => { - const filePath = path.join(this.path, file.name); - - if (file.isDirectory()) { - return fs.rmdir(filePath, { recursive: true }); - } - - return fs.unlink(filePath); - }); - - await Promise.all(promises); - - return this.initialize(); - } -} diff --git a/src/storage-adapters/mongo/index.js b/src/storage-adapters/mongo/index.js deleted file mode 100644 index 46cf37e9f..000000000 --- a/src/storage-adapters/mongo/index.js +++ /dev/null @@ -1,100 +0,0 @@ -/** - * This file is the boundary beyond which the usage of MongoDB is abstracted. - * Object IDs are used as opaque unique IDs. 
- */ - -import { MongoClient, ObjectId } from 'mongodb'; - -const PDF_MIME_TYPE = 'application/pdf'; - -export default class MongoAdapter { - constructor({ database: databaseName, collection: collectionName, connectionURI }) { - const client = new MongoClient(connectionURI); - - this.databaseName = databaseName; - this.collectionName = collectionName; - this.client = client; - } - - async initialize() { - await this.client.connect(); - const db = this.client.db(this.databaseName); - - this.collection = db.collection(this.collectionName); - - return this; - } - - async finalize() { - return this.client.close(); - } - - async record({ serviceId, documentType, content: passedContent, mimeType, fetchDate, isRefilter, snapshotId }) { - let content = passedContent; - - if (mimeType == PDF_MIME_TYPE) { - content = passedContent.toString('utf-8'); // Serialize PDF - } - - const previousRecord = await this.getLatestRecord(serviceId, documentType); - - if (previousRecord && previousRecord.content == content) { - return {}; - } - - const recordProperties = Object.fromEntries(Object.entries({ - serviceId, - documentType, - content: passedContent, - mimeType, - fetchDate, - isRefilter, - snapshotId, - }).filter(([ , value ]) => value)); // Remove empty values - - const isFirstRecord = !await this.collection.findOne({ serviceId, documentType }); - - if (snapshotId) { - recordProperties.snapshotId = new ObjectId(snapshotId); - } - - if (isFirstRecord) { - recordProperties.isFirstRecord = isFirstRecord; - } - - const insertResult = await this.collection.insertOne({ ...recordProperties, created_at: new Date() }); - - return { - id: insertResult.insertedId.toString(), - isFirstRecord, - }; - } - - async getLatestRecord(serviceId, documentType) { - const [record] = await this.collection.find({ serviceId, documentType }).limit(1).sort({ fetchDate: -1 }).toArray(); - - if (!record) { - return {}; - } - - const { _id, content, mimeType, fetchDate, isRefilter } = record; - - return { - 
id: _id.toString(), - content, - mimeType, - fetchDate: new Date(fetchDate), - isRefilter: Boolean(isRefilter), - }; - } - - /* eslint-disable */ - async* iterate() { - throw new Error('#iterate is not yet implemented in the MongoDB storage adapter'); - } - /* eslint-enable */ - - async _removeAllRecords() { - return this.collection.deleteMany({}); - } -}