From 6167139dfaa1a6eb7c586043eb991842e6cfe135 Mon Sep 17 00:00:00 2001 From: ikprk <168457495+ikprk@users.noreply.github.com> Date: Wed, 10 Jul 2024 18:32:45 +0200 Subject: [PATCH] Better orion language detection (#337) * New model to track orion language processing * New function to detect language * New custom migration * Add cursor tracker and video orion language to offchain export * Running `custom-migration` command * Add manager to trigger video language updates * Adjust orion video language manager to support video update * Revert "Running `custom-migration` command" This reverts commit 71c4997aa17fbd6b4952ee4c7b935a1c7ce26e02. * Second run of `create-migrations` command * CR fixes * move 'orion_offchain_cursor' table to admin schema * bump package version and add change log * fix: bug in case detected language is undefined --------- Co-authored-by: Zeeshan Akram <97m.zeeshan@gmail.com> --- .env | 3 + CHANGELOG.md | 5 ++ db/migrations/1720623003671-Data.js | 11 +++ ...962433-Views.js => 1720623003800-Views.js} | 4 +- package-lock.json | 37 ++++----- package.json | 6 +- src/mappings/content/video.ts | 12 +-- src/mappings/utils.ts | 6 ++ src/model/OrionOffchainCursor.ts | 20 +++++ src/model/index.ts | 1 + src/utils/OrionVideoLanguageManager.ts | 64 ++++++++++++++++ .../customMigrations/setOrionLanguage.ts | 56 -------------- .../setOrionLanguageProvider.ts | 75 +++++++++++++++++++ src/utils/language.ts | 33 +++----- src/utils/offchainState.ts | 3 +- 15 files changed, 223 insertions(+), 113 deletions(-) create mode 100644 db/migrations/1720623003671-Data.js rename db/migrations/{1709641962433-Views.js => 1720623003800-Views.js} (91%) create mode 100644 src/model/OrionOffchainCursor.ts create mode 100644 src/utils/OrionVideoLanguageManager.ts delete mode 100644 src/utils/customMigrations/setOrionLanguage.ts create mode 100644 src/utils/customMigrations/setOrionLanguageProvider.ts diff --git a/.env b/.env index 3cc110bab..c916afc28 100644 --- a/.env +++ b/.env @@ 
-62,6 +62,9 @@ TRUST_PROXY=uniquelocal SENDGRID_API_KEY= SENDGRID_FROM_EMAIL=gateway@example.com +# Detectlanguage +DETECTLANGUAGE_API_KEY= + # Debug settings SQD_DEBUG=api:* diff --git a/CHANGELOG.md b/CHANGELOG.md index 598ddafa1..9ae4ec682 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +# 4.0.4 + +## Bug Fixes: +- Fixed: improve the accuracy of `Video.orionLanguage` field by reworking the `predictVideoLanguage` function in `src/utils/language.ts` + # 4.0.3 ## Misc diff --git a/db/migrations/1720623003671-Data.js b/db/migrations/1720623003671-Data.js new file mode 100644 index 000000000..5a14f9337 --- /dev/null +++ b/db/migrations/1720623003671-Data.js @@ -0,0 +1,11 @@ +module.exports = class Data1720623003671 { + name = 'Data1720623003671' + + async up(db) { + await db.query(`CREATE TABLE "admin"."orion_offchain_cursor" ("cursor_name" character varying NOT NULL, "value" bigint NOT NULL, CONSTRAINT "PK_7083797352af5a21224b6c8ccbc" PRIMARY KEY ("cursor_name"))`) + } + + async down(db) { + await db.query(`DROP TABLE "admin"."orion_offchain_cursor"`) + } +} diff --git a/db/migrations/1709641962433-Views.js b/db/migrations/1720623003800-Views.js similarity index 91% rename from db/migrations/1709641962433-Views.js rename to db/migrations/1720623003800-Views.js index 247ab4dca..b138499f6 100644 --- a/db/migrations/1709641962433-Views.js +++ b/db/migrations/1720623003800-Views.js @@ -1,8 +1,8 @@ const { getViewDefinitions } = require('../viewDefinitions') -module.exports = class Views1709641962433 { - name = 'Views1709641962433' +module.exports = class Views1720623003800 { + name = 'Views1720623003800' async up(db) { const viewDefinitions = getViewDefinitions(db); diff --git a/package-lock.json b/package-lock.json index d4b2b482c..a0650bcdd 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,12 +1,12 @@ { "name": "orion", - "version": "4.0.3", + "version": "4.0.4", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "orion", - 
"version": "4.0.3", + "version": "4.0.4", "hasInstallScript": true, "workspaces": [ "network-tests" @@ -40,6 +40,7 @@ "cookie-parser": "^1.4.6", "csv-stringify": "^6.3.0", "dayjs": "^1.11.7", + "detectlanguage": "^2.1.0", "dotenv": "^16.0.3", "dotenv-expand": "^10.0.0", "express-openapi-validator": "^5.0.3", @@ -55,7 +56,6 @@ "patch-package": "^6.5.0", "pg": "8.8.0", "swagger-ui-express": "^4.6.2", - "tinyld": "^1.3.4", "type-graphql": "^1.2.0-rc.1", "typeorm": "^0.3.11", "ua-parser-js": "^1.0.34", @@ -13889,6 +13889,22 @@ "integrity": "sha512-T0NIuQpnTvFDATNuHN5roPwSBG83rFsuO+MXXH9/3N1eFbn4wcPjttvjMLEPWJ0RGUYgQE7cGgS3tNxbqCGM7g==", "dev": true }, + "node_modules/detectlanguage": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/detectlanguage/-/detectlanguage-2.1.0.tgz", + "integrity": "sha512-EbLGyZxiQJeur5a+GNOzBV9xL/r/7GfvRALSHAKepw38UAvCssn7obVvhsioRIV+uDj3IQtXzL7iNkwu0oCp7g==", + "dependencies": { + "axios": "^0.21.1" + } + }, + "node_modules/detectlanguage/node_modules/axios": { + "version": "0.21.4", + "resolved": "https://registry.npmjs.org/axios/-/axios-0.21.4.tgz", + "integrity": "sha512-ut5vewkiu8jjGBdqpM44XxjuCjq9LAKeHVmoVfHVzy8eHgxxq8SbAVQNovDA8mVi05kP0Ea/n/UzcSHcTJQfNg==", + "dependencies": { + "follow-redirects": "^1.14.0" + } + }, "node_modules/dezalgo": { "version": "1.0.4", "resolved": "https://registry.npmjs.org/dezalgo/-/dezalgo-1.0.4.tgz", @@ -25796,21 +25812,6 @@ "next-tick": "1" } }, - "node_modules/tinyld": { - "version": "1.3.4", - "resolved": "https://registry.npmjs.org/tinyld/-/tinyld-1.3.4.tgz", - "integrity": "sha512-u26CNoaInA4XpDU+8s/6Cq8xHc2T5M4fXB3ICfXPokUQoLzmPgSZU02TAkFwFMJCWTjk53gtkS8pETTreZwCqw==", - "bin": { - "tinyld": "bin/tinyld.js", - "tinyld-heavy": "bin/tinyld-heavy.js", - "tinyld-light": "bin/tinyld-light.js" - }, - "engines": { - "node": ">= 12.10.0", - "npm": ">= 6.12.0", - "yarn": ">= 1.20.0" - } - }, "node_modules/title-case": { "version": "3.0.3", "resolved": 
"https://registry.npmjs.org/title-case/-/title-case-3.0.3.tgz", diff --git a/package.json b/package.json index 61462ea6c..f2ec73eda 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "orion", - "version": "4.0.3", + "version": "4.0.4", "engines": { "node": ">=16" }, @@ -72,6 +72,7 @@ "cookie-parser": "^1.4.6", "csv-stringify": "^6.3.0", "dayjs": "^1.11.7", + "detectlanguage": "^2.1.0", "dotenv": "^16.0.3", "dotenv-expand": "^10.0.0", "express-openapi-validator": "^5.0.3", @@ -87,7 +88,6 @@ "patch-package": "^6.5.0", "pg": "8.8.0", "swagger-ui-express": "^4.6.2", - "tinyld": "^1.3.4", "type-graphql": "^1.2.0-rc.1", "typeorm": "^0.3.11", "ua-parser-js": "^1.0.34", @@ -105,8 +105,8 @@ "@subsquid/substrate-typegen": "^2.1.0", "@subsquid/typeorm-codegen": "0.3.1", "@types/async-lock": "^1.1.3", - "@types/chai": "^4.3.11", "@types/big-json": "^3.2.4", + "@types/chai": "^4.3.11", "@types/cookie-parser": "^1.4.3", "@types/express-rate-limit": "^6.0.0", "@types/mocha": "^10.0.1", diff --git a/src/mappings/content/video.ts b/src/mappings/content/video.ts index 8589f202e..7249b2d42 100644 --- a/src/mappings/content/video.ts +++ b/src/mappings/content/video.ts @@ -17,10 +17,10 @@ import { VideoViewEvent, } from '../../model' import { EventHandlerContext } from '../../utils/events' -import { predictVideoLanguage } from '../../utils/language' import { deserializeMetadata, genericEventFields, + orionVideoLanguageManager, u8aToBytes, videoRelevanceManager, } from '../utils' @@ -122,10 +122,7 @@ export async function processVideoCreatedEvent({ } } - video.orionLanguage = predictVideoLanguage({ - title: video.title ?? '', - description: video.description ?? '', - }) + video.orionLanguage = null channel.totalVideosCreated += 1 @@ -192,10 +189,7 @@ export async function processVideoUpdatedEvent({ ) } - video.orionLanguage = predictVideoLanguage({ - title: video.title ?? '', - description: video.description ?? 
'', - }) + orionVideoLanguageManager.scheduleVideoForDetection(video.id) if (autoIssueNft) { await processNft(overlay, block, indexInBlock, extrinsicHash, video, contentActor, autoIssueNft) diff --git a/src/mappings/utils.ts b/src/mappings/utils.ts index 49d1f8877..4f2da75f3 100644 --- a/src/mappings/utils.ts +++ b/src/mappings/utils.ts @@ -10,10 +10,16 @@ import { Event, MetaprotocolTransactionResultFailed, NftActivity, NftHistoryEntr import { CommentCountersManager } from '../utils/CommentsCountersManager' import { VideoRelevanceManager } from '../utils/VideoRelevanceManager' import { EntityManagerOverlay } from '../utils/overlay' +import { OrionVideoLanguageManager } from '../utils/OrionVideoLanguageManager' +export const orionVideoLanguageManager = new OrionVideoLanguageManager() export const commentCountersManager = new CommentCountersManager() export const videoRelevanceManager = new VideoRelevanceManager() // eslint-disable-next-line no-void +void orionVideoLanguageManager.init( + 1000 * 60 * 5 // 5 mins +) +// eslint-disable-next-line no-void void videoRelevanceManager.init({ fullUpdateLoopTime: 1000 * 60 * 60 * 12, // 12 hrs scheduledUpdateLoopTime: 1000 * 60 * 10, // 10 mins diff --git a/src/model/OrionOffchainCursor.ts b/src/model/OrionOffchainCursor.ts new file mode 100644 index 000000000..92854164b --- /dev/null +++ b/src/model/OrionOffchainCursor.ts @@ -0,0 +1,20 @@ +import { Column, Entity, PrimaryColumn } from 'typeorm' + +@Entity({ schema: 'admin' }) +export class OrionOffchainCursor { + constructor(props?: Partial) { + Object.assign(this, props) + } + + /** + * Name of the offchain cursor + */ + @PrimaryColumn() + cursorName!: string + + /** + * Value of the cursor + */ + @Column('int8', { nullable: false }) + value!: number +} diff --git a/src/model/index.ts b/src/model/index.ts index 7ebc7a5a7..21c96308b 100644 --- a/src/model/index.ts +++ b/src/model/index.ts @@ -1,2 +1,3 @@ export * from './generated' export { NextEntityId } from 
'./NextEntityId' +export { OrionOffchainCursor } from './OrionOffchainCursor' diff --git a/src/utils/OrionVideoLanguageManager.ts b/src/utils/OrionVideoLanguageManager.ts new file mode 100644 index 000000000..5e059fb27 --- /dev/null +++ b/src/utils/OrionVideoLanguageManager.ts @@ -0,0 +1,64 @@ +import { EntityManager } from 'typeorm' +import { + detectVideoLanguageWithProvider, + updateVideoLanguages, + VIDEO_ORION_LANGUAGE_CURSOR_NAME, +} from './customMigrations/setOrionLanguageProvider' +import { globalEm } from './globalEm' + +export class OrionVideoLanguageManager { + private videoToDetect: Set = new Set() + + async init(intervalMs: number): Promise { + if (!VIDEO_ORION_LANGUAGE_CURSOR_NAME) { + return + } + + this.updateLoop(intervalMs) + .then(() => { + /* Do nothing */ + }) + .catch((err) => { + console.error(err) + process.exit(-1) + }) + } + + scheduleVideoForDetection(id: string | null | undefined) { + if (id) { + this.videoToDetect.add(id) + } + } + + async updateScheduledVideoLanguage(em: EntityManager) { + if (!this.videoToDetect.size) { + return + } + + const videos = await em.query(` + SELECT id, title, description + FROM admin.video + WHERE id in (${[...this.videoToDetect.values()].map((id) => `'${id}'`).join(',')}) + `) + + await updateVideoLanguages(em, videos) + this.videoToDetect.clear() + } + + async updateOrionVideoLanguage() { + return detectVideoLanguageWithProvider() + } + + private async updateLoop(intervalMs: number): Promise { + const em = await globalEm + while (true) { + await this.updateScheduledVideoLanguage(em).catch((e) => { + console.log(`Updating scheduled videos Orion language with provider failed`, e) + }) + await this.updateOrionVideoLanguage().catch((e) => { + console.log(`Updating Orion language with provider failed`, e) + }) + await new Promise((resolve) => setTimeout(resolve, intervalMs)) + } + } +} diff --git a/src/utils/customMigrations/setOrionLanguage.ts b/src/utils/customMigrations/setOrionLanguage.ts deleted file 
mode 100644 index 79299fe73..000000000 --- a/src/utils/customMigrations/setOrionLanguage.ts +++ /dev/null @@ -1,56 +0,0 @@ -import { EntityManager } from 'typeorm' -import { globalEm } from '../globalEm' -import { predictVideoLanguage } from '../language' - -async function detectVideoLanguage() { - const em: EntityManager = await globalEm - const videos: any[] = await em.query(` - SELECT id, title, description - FROM admin.video - `) - - // Temporary storage for batch update data - const updates: any[] = [] - - for (const [i, video] of videos.entries()) { - const orionLanguage = predictVideoLanguage({ - title: video.title, - description: video.description, - }) - - // Instead of updating immediately, push the update data into the array - updates.push({ orionLanguage, id: video.id }) - console.log(i) - } - - // Define batch size - const batchSize = 1000 // Adjust the batch size based on your database and network performance - - for (let i = 0; i < updates.length; i += batchSize) { - const batch = updates.slice(i, i + batchSize) - - // Prepare the query and parameters for batch update - const query = ` - UPDATE admin.video AS v SET - orion_language = c.orion_language - FROM (VALUES ${batch - .map((_, idx) => `($${idx * 2 + 1}, $${idx * 2 + 2})`) - .join(',')}) AS c(orion_language, id) - WHERE c.id = v.id; - ` - - const queryParams = batch.flatMap((update) => [update.orionLanguage, update.id]) - - // Execute batch update - await em.query(query, queryParams) - } - - console.log(`Updated languages for ${videos.length} videos`) -} - -detectVideoLanguage() - .then(() => console.log('Update process completed.')) - .catch(() => { - console.error('process failed') - process.exit(1) - }) diff --git a/src/utils/customMigrations/setOrionLanguageProvider.ts b/src/utils/customMigrations/setOrionLanguageProvider.ts new file mode 100644 index 000000000..e80f2b0e7 --- /dev/null +++ b/src/utils/customMigrations/setOrionLanguageProvider.ts @@ -0,0 +1,75 @@ +import { EntityManager } 
from 'typeorm' +import { OrionOffchainCursor } from '../../model' +import { globalEm } from '../globalEm' +import { predictLanguageWithProvider } from '../language' + +const batchSize = 5_000 // Adjust the batch size based on your database and network performance + +type VideoUpdateType = { + id: string + title: string + description: string +} + +export const VIDEO_ORION_LANGUAGE_CURSOR_NAME = 'video_orion_language' + +export async function updateVideoLanguages(em: EntityManager, videos: VideoUpdateType[]) { + const mappedVideos = videos.map((video) => `${video.title} ${video.description}`) + + const predictionForVideos = await predictLanguageWithProvider(mappedVideos) + + const videosWithDetections = videos.map((video, index) => ({ + ...video, + detectedLanguage: predictionForVideos[index], + })) + + const query = ` + UPDATE admin.video AS v SET + orion_language = c.orion_language + FROM (VALUES ${videosWithDetections + .map((_, idx) => `($${idx * 2 + 1}, $${idx * 2 + 2})`) + .join(',')}) AS c(orion_language, id) + WHERE c.id = v.id; + ` + + const queryParams = videosWithDetections.flatMap((update) => [update.detectedLanguage, update.id]) + + // Execute batch update + await em.query(query, queryParams) +} + +export async function detectVideoLanguageWithProvider() { + const em: EntityManager = await globalEm + let cursorEntity: { value: number }[] = await em.query( + `SELECT value FROM admin.orion_offchain_cursor WHERE cursor_name='${VIDEO_ORION_LANGUAGE_CURSOR_NAME}'` + ) + while (true) { + const cursor = +(cursorEntity[0]?.value ?? 
0) + + const videos: VideoUpdateType[] = await em.query(` + SELECT id, title, description + FROM admin.video + ORDER BY id::INTEGER ASC + OFFSET ${cursor} + LIMIT ${batchSize} + `) + + if (!videos.length) { + console.log('No more videos!') + break + } + + await updateVideoLanguages(em, videos) + const newCursor = new OrionOffchainCursor({ + cursorName: VIDEO_ORION_LANGUAGE_CURSOR_NAME, + value: cursor + Math.min(batchSize, videos.length), + }) + await em.save(newCursor) + cursorEntity = [newCursor] + console.log( + `Updated languages for videos in range ${cursor}-${ + cursor + Math.min(batchSize, videos.length) + }` + ) + } +} diff --git a/src/utils/language.ts b/src/utils/language.ts index a9f8c2b45..d482ac1f8 100644 --- a/src/utils/language.ts +++ b/src/utils/language.ts @@ -1,4 +1,8 @@ -import { detectAll } from 'tinyld' +import DetectLanguage from 'detectlanguage' + +const languageDetectionApiKey = process.env.DETECTLANGUAGE_API_KEY + +const languageDetectionInstace = new DetectLanguage(languageDetectionApiKey ?? '') function cleanString(input: string): string { // First, remove URLs. This pattern targets a broad range of URLs. @@ -10,27 +14,8 @@ function cleanString(input: string): string { return cleanedString } -function predictLanguage(text: string): { lang: string; accuracy: number } | undefined { - const cleanedText = cleanString(text) - - // Get the most accurate language prediction - return detectAll(cleanedText)?.[0] -} - -export function predictVideoLanguage({ title, description }: any): string | undefined { - let detectedLang: string | undefined - - const titleLang = predictLanguage(title ?? 
'') - - detectedLang = titleLang?.lang - - if ((titleLang?.accuracy || 0) < 0.5) { - const titleAndDescriptionLang = predictLanguage(`${title} ${description}`) - if ((titleAndDescriptionLang?.accuracy || 0) > (titleLang?.accuracy || 0)) { - // then - detectedLang = titleAndDescriptionLang?.lang - } - } - - return detectedLang +export async function predictLanguageWithProvider(texts: string[]) { + const cleanedTexts = texts.map(cleanString) + const result = await languageDetectionInstace.detect(cleanedTexts) + return result.map((row) => row[0]?.language) } diff --git a/src/utils/offchainState.ts b/src/utils/offchainState.ts index 7360ed2c3..353e19322 100644 --- a/src/utils/offchainState.ts +++ b/src/utils/offchainState.ts @@ -60,8 +60,9 @@ const exportedStateMap: ExportedStateMap = { EmailDeliveryAttempt: true, Token: true, NextEntityId: true, + OrionOffchainCursor: true, Channel: ['isExcluded', 'videoViewsNum', 'followsNum', 'yppStatus', 'channelWeight'], - Video: ['isExcluded', 'viewsNum'], + Video: ['isExcluded', 'viewsNum', 'orionLanguage'], Comment: ['isExcluded'], OwnedNft: ['isFeatured'], VideoCategory: ['isSupported'],