From 79451064e1a53779014ca6b964eef83dbfe42363 Mon Sep 17 00:00:00 2001 From: ikprk Date: Wed, 3 Jul 2024 08:39:05 +0200 Subject: [PATCH] CR fixes --- package-lock.json | 16 ------ package.json | 3 +- src/mappings/content/video.ts | 19 +------ src/utils/OrionVideoLanguageManager.ts | 4 +- .../customMigrations/setOrionLanguage.ts | 56 ------------------- .../setOrionLanguageProvider.ts | 44 ++++++++------- src/utils/language.ts | 31 +--------- 7 files changed, 33 insertions(+), 140 deletions(-) delete mode 100644 src/utils/customMigrations/setOrionLanguage.ts diff --git a/package-lock.json b/package-lock.json index 660052322..50059f6c4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -56,7 +56,6 @@ "patch-package": "^6.5.0", "pg": "8.8.0", "swagger-ui-express": "^4.6.2", - "tinyld": "^1.3.4", "type-graphql": "^1.2.0-rc.1", "typeorm": "^0.3.11", "ua-parser-js": "^1.0.34", @@ -25813,21 +25812,6 @@ "next-tick": "1" } }, - "node_modules/tinyld": { - "version": "1.3.4", - "resolved": "https://registry.npmjs.org/tinyld/-/tinyld-1.3.4.tgz", - "integrity": "sha512-u26CNoaInA4XpDU+8s/6Cq8xHc2T5M4fXB3ICfXPokUQoLzmPgSZU02TAkFwFMJCWTjk53gtkS8pETTreZwCqw==", - "bin": { - "tinyld": "bin/tinyld.js", - "tinyld-heavy": "bin/tinyld-heavy.js", - "tinyld-light": "bin/tinyld-light.js" - }, - "engines": { - "node": ">= 12.10.0", - "npm": ">= 6.12.0", - "yarn": ">= 1.20.0" - } - }, "node_modules/title-case": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/title-case/-/title-case-3.0.3.tgz", diff --git a/package.json b/package.json index fc40befab..33c3e5a96 100644 --- a/package.json +++ b/package.json @@ -88,7 +88,6 @@ "patch-package": "^6.5.0", "pg": "8.8.0", "swagger-ui-express": "^4.6.2", - "tinyld": "^1.3.4", "type-graphql": "^1.2.0-rc.1", "typeorm": "^0.3.11", "ua-parser-js": "^1.0.34", @@ -106,8 +105,8 @@ "@subsquid/substrate-typegen": "^2.1.0", "@subsquid/typeorm-codegen": "0.3.1", "@types/async-lock": "^1.1.3", - "@types/chai": "^4.3.11", "@types/big-json": "^3.2.4", + "@types/chai": "^4.3.11", "@types/cookie-parser": "^1.4.3", "@types/express-rate-limit": "^6.0.0", "@types/mocha": "^10.0.1", diff --git a/src/mappings/content/video.ts b/src/mappings/content/video.ts index d3bde7d7d..7249b2d42 100644 --- a/src/mappings/content/video.ts +++ b/src/mappings/content/video.ts @@ -16,10 +16,7 @@ import { VideoPosted, VideoViewEvent, } from '../../model' -import { VIDEO_ORION_LANGUAGE_CURSOR_NAME } from '../../utils/customMigrations/setOrionLanguageProvider' import { EventHandlerContext } from '../../utils/events' -import { predictVideoLanguage } from '../../utils/language' -import { OrionVideoLanguageManager } from '../../utils/OrionVideoLanguageManager' import { deserializeMetadata, genericEventFields, @@ -125,12 +122,7 @@ export async function processVideoCreatedEvent({ } } - video.orionLanguage = VIDEO_ORION_LANGUAGE_CURSOR_NAME - ? null - : predictVideoLanguage({ - title: video.title ?? '', - description: video.description ?? '', - }) + video.orionLanguage = null channel.totalVideosCreated += 1 @@ -197,14 +189,7 @@ export async function processVideoUpdatedEvent({ ) } - if (VIDEO_ORION_LANGUAGE_CURSOR_NAME) { - orionVideoLanguageManager.scheduleVideoForDetection(video.id) - } else { - video.orionLanguage = predictVideoLanguage({ - title: video.title ?? '', - description: video.description ?? '', - }) - } + orionVideoLanguageManager.scheduleVideoForDetection(video.id) if (autoIssueNft) { await processNft(overlay, block, indexInBlock, extrinsicHash, video, contentActor, autoIssueNft) diff --git a/src/utils/OrionVideoLanguageManager.ts b/src/utils/OrionVideoLanguageManager.ts index d177cb44c..5e059fb27 100644 --- a/src/utils/OrionVideoLanguageManager.ts +++ b/src/utils/OrionVideoLanguageManager.ts @@ -52,7 +52,9 @@ export class OrionVideoLanguageManager { private async updateLoop(intervalMs: number): Promise { const em = await globalEm while (true) { - await this.updateScheduledVideoLanguage(em) + await this.updateScheduledVideoLanguage(em).catch((e) => { + console.log(`Updating scheduled videos Orion language with provider failed`, e) + }) await this.updateOrionVideoLanguage().catch((e) => { console.log(`Updating Orion language with provider failed`, e) }) diff --git a/src/utils/customMigrations/setOrionLanguage.ts b/src/utils/customMigrations/setOrionLanguage.ts deleted file mode 100644 index 79299fe73..000000000 --- a/src/utils/customMigrations/setOrionLanguage.ts +++ /dev/null @@ -1,56 +0,0 @@ -import { EntityManager } from 'typeorm' -import { globalEm } from '../globalEm' -import { predictVideoLanguage } from '../language' - -async function detectVideoLanguage() { - const em: EntityManager = await globalEm - const videos: any[] = await em.query(` - SELECT id, title, description - FROM admin.video - `) - - // Temporary storage for batch update data - const updates: any[] = [] - - for (const [i, video] of videos.entries()) { - const orionLanguage = predictVideoLanguage({ - title: video.title, - description: video.description, - }) - - // Instead of updating immediately, push the update data into the array - updates.push({ orionLanguage, id: video.id }) - console.log(i) - } - - // Define batch size - const batchSize = 1000 // Adjust the batch size based on your database and network performance - - for (let i = 0; i < updates.length; i += batchSize) { - const batch = updates.slice(i, i + batchSize) - - // Prepare the query and parameters for batch update - const query = ` - UPDATE admin.video AS v SET - orion_language = c.orion_language - FROM (VALUES ${batch - .map((_, idx) => `($${idx * 2 + 1}, $${idx * 2 + 2})`) - .join(',')}) AS c(orion_language, id) - WHERE c.id = v.id; - ` - - const queryParams = batch.flatMap((update) => [update.orionLanguage, update.id]) - - // Execute batch update - await em.query(query, queryParams) - } - - console.log(`Updated languages for ${videos.length} videos`) -} - -detectVideoLanguage() - .then(() => console.log('Update process completed.')) - .catch(() => { - console.error('process failed') - process.exit(1) - }) diff --git a/src/utils/customMigrations/setOrionLanguageProvider.ts b/src/utils/customMigrations/setOrionLanguageProvider.ts index 94f2b959e..0ec55a8a7 100644 --- a/src/utils/customMigrations/setOrionLanguageProvider.ts +++ b/src/utils/customMigrations/setOrionLanguageProvider.ts @@ -1,7 +1,7 @@ import { EntityManager } from 'typeorm' import { OrionOffchainCursor } from '../../model' import { globalEm } from '../globalEm' -import { predictLanguageForArray } from '../language' +import { predictLanguageWithProvider } from '../language' const batchSize = 5_000 // Adjust the batch size based on your database and network performance @@ -16,7 +16,7 @@ export const VIDEO_ORION_LANGUAGE_CURSOR_NAME = 'video_orion_language' export async function updateVideoLanguages(em: EntityManager, videos: VideoUpdateType[]) { const mappedVideos = videos.map((video) => `${video.title} ${video.description}`) - const predictionForVideos = await predictLanguageForArray(mappedVideos) + const predictionForVideos = await predictLanguageWithProvider(mappedVideos) const videosWithDetections = videos.map((video, index) => ({ ...video, @@ -40,32 +40,36 @@ export async function updateVideoLanguages(em: EntityManager, videos: VideoUpdat export async function detectVideoLanguageWithProvider() { const em: EntityManager = await globalEm - const cursorEntity: { value: string }[] = await em.query( + let cursorEntity: { value: number }[] = await em.query( `SELECT value FROM orion_offchain_cursor WHERE cursor_name='${VIDEO_ORION_LANGUAGE_CURSOR_NAME}'` ) - const cursor = +(cursorEntity[0]?.value ?? 0) + while (true) { + const cursor = +(cursorEntity[0]?.value ?? 0) - const videos: VideoUpdateType[] = await em.query(` + const videos: VideoUpdateType[] = await em.query(` SELECT id, title, description FROM admin.video ORDER BY id::INTEGER ASC OFFSET ${cursor} LIMIT ${batchSize} - `) + `) - if (!videos.length) { - console.log('No more videos!') - return - } - await updateVideoLanguages(em, videos) - const newCursor = new OrionOffchainCursor({ - cursorName: VIDEO_ORION_LANGUAGE_CURSOR_NAME, - value: cursor + Math.min(batchSize, videos.length), - }) - await em.save(newCursor) - console.log( - `Updated languages for videos in range ${cursor}-${cursor + Math.min(batchSize, videos.length)}` - ) + if (!videos.length) { + console.log('No more videos!') + break + } - await detectVideoLanguageWithProvider() + await updateVideoLanguages(em, videos) + const newCursor = new OrionOffchainCursor({ + cursorName: VIDEO_ORION_LANGUAGE_CURSOR_NAME, + value: cursor + Math.min(batchSize, videos.length), + }) + await em.save(newCursor) + cursorEntity = [newCursor] + console.log( + `Updated languages for videos in range ${cursor}-${ + cursor + Math.min(batchSize, videos.length) + }` + ) + } } diff --git a/src/utils/language.ts b/src/utils/language.ts index 4b74978ef..1846766c6 100644 --- a/src/utils/language.ts +++ b/src/utils/language.ts @@ -1,4 +1,3 @@ -import { detectAll } from 'tinyld' import DetectLanguage from 'detectlanguage' const languageDetectionApiKey = process.env.DETECTLANGUAGE_API_KEY @@ -15,32 +14,8 @@ function cleanString(input: string): string { return cleanedString } -function predictLanguage(text: string): { lang: string; accuracy: number } | undefined { - const cleanedText = cleanString(text) - - // Get the most accurate language prediction - return detectAll(cleanedText)?.[0] -} - -export async function predictLanguageForArray(texts: string[]) { - const result = await languageDetectionInstace.detect(texts) +export async function predictLanguageWithProvider(texts: string[]) { + const cleanedTexts = texts.map(cleanString) + const result = await languageDetectionInstace.detect(cleanedTexts) return result.map((row) => row[0].language) } - -export function predictVideoLanguage({ title, description }: any): string | undefined { - let detectedLang: string | undefined - - const titleLang = predictLanguage(title ?? '') - - detectedLang = titleLang?.lang - - if ((titleLang?.accuracy || 0) < 0.5) { - const titleAndDescriptionLang = predictLanguage(`${title} ${description}`) - if ((titleAndDescriptionLang?.accuracy || 0) > (titleLang?.accuracy || 0)) { - // then - detectedLang = titleAndDescriptionLang?.lang - } - } - - return detectedLang -}