Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better orion language detection #337

Merged
3 changes: 3 additions & 0 deletions .env
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ TRUST_PROXY=uniquelocal
SENDGRID_API_KEY=
SENDGRID_FROM_EMAIL=gateway@example.com

# Detectlanguage
DETECTLANGUAGE_API_KEY=

# Debug settings
SQD_DEBUG=api:*

Expand Down
11 changes: 11 additions & 0 deletions db/migrations/1719233585592-Data.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
module.exports = class Data1719233585592 {
name = 'Data1719233585592'

async up(db) {
await db.query(`CREATE TABLE "orion_offchain_cursor" ("cursor_name" character varying NOT NULL, "value" bigint NOT NULL, CONSTRAINT "PK_7083797352af5a21224b6c8ccbc" PRIMARY KEY ("cursor_name"))`)
}

async down(db) {
await db.query(`DROP TABLE "orion_offchain_cursor"`)
}
}
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@

const { getViewDefinitions } = require('../viewDefinitions')

module.exports = class Views1709641962433 {
name = 'Views1709641962433'
module.exports = class Views1719233585692 {
name = 'Views1719233585692'

async up(db) {
const viewDefinitions = getViewDefinitions(db);
Expand Down
33 changes: 17 additions & 16 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
"cookie-parser": "^1.4.6",
"csv-stringify": "^6.3.0",
"dayjs": "^1.11.7",
"detectlanguage": "^2.1.0",
"dotenv": "^16.0.3",
"dotenv-expand": "^10.0.0",
"express-openapi-validator": "^5.0.3",
Expand All @@ -87,7 +88,6 @@
"patch-package": "^6.5.0",
"pg": "8.8.0",
"swagger-ui-express": "^4.6.2",
"tinyld": "^1.3.4",
"type-graphql": "^1.2.0-rc.1",
"typeorm": "^0.3.11",
"ua-parser-js": "^1.0.34",
Expand All @@ -105,8 +105,8 @@
"@subsquid/substrate-typegen": "^2.1.0",
"@subsquid/typeorm-codegen": "0.3.1",
"@types/async-lock": "^1.1.3",
"@types/chai": "^4.3.11",
"@types/big-json": "^3.2.4",
"@types/chai": "^4.3.11",
"@types/cookie-parser": "^1.4.3",
"@types/express-rate-limit": "^6.0.0",
"@types/mocha": "^10.0.1",
Expand Down
12 changes: 3 additions & 9 deletions src/mappings/content/video.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ import {
VideoViewEvent,
} from '../../model'
import { EventHandlerContext } from '../../utils/events'
import { predictVideoLanguage } from '../../utils/language'
import {
deserializeMetadata,
genericEventFields,
orionVideoLanguageManager,
u8aToBytes,
videoRelevanceManager,
} from '../utils'
Expand Down Expand Up @@ -122,10 +122,7 @@ export async function processVideoCreatedEvent({
}
}

video.orionLanguage = predictVideoLanguage({
title: video.title ?? '',
description: video.description ?? '',
})
video.orionLanguage = null

channel.totalVideosCreated += 1

Expand Down Expand Up @@ -192,10 +189,7 @@ export async function processVideoUpdatedEvent({
)
}

video.orionLanguage = predictVideoLanguage({
title: video.title ?? '',
description: video.description ?? '',
})
orionVideoLanguageManager.scheduleVideoForDetection(video.id)

if (autoIssueNft) {
await processNft(overlay, block, indexInBlock, extrinsicHash, video, contentActor, autoIssueNft)
Expand Down
6 changes: 6 additions & 0 deletions src/mappings/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,16 @@ import { Event, MetaprotocolTransactionResultFailed, NftActivity, NftHistoryEntr
import { CommentCountersManager } from '../utils/CommentsCountersManager'
import { VideoRelevanceManager } from '../utils/VideoRelevanceManager'
import { EntityManagerOverlay } from '../utils/overlay'
import { OrionVideoLanguageManager } from '../utils/OrionVideoLanguageManager'

// Module-level manager instances, created once when this module is loaded and exported for the mappings.
export const orionVideoLanguageManager = new OrionVideoLanguageManager()
export const commentCountersManager = new CommentCountersManager()
export const videoRelevanceManager = new VideoRelevanceManager()
// Kick off the Orion language manager at module load; the returned promise is deliberately not awaited.
// The argument is the interval (in milliseconds) passed to `init`.
// eslint-disable-next-line no-void
void orionVideoLanguageManager.init(
1000 * 60 * 5 // 5 mins
)
// eslint-disable-next-line no-void
void videoRelevanceManager.init({
fullUpdateLoopTime: 1000 * 60 * 60 * 12, // 12 hrs
scheduledUpdateLoopTime: 1000 * 60 * 10, // 10 mins
Expand Down
20 changes: 20 additions & 0 deletions src/model/OrionOffchainCursor.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import { Entity, Column, PrimaryColumn } from 'typeorm'

@Entity()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's better to create this in the `admin` schema.

Suggested change
@Entity()
@Entity({ schema: 'admin' })

/**
 * A named cursor record: a `cursorName` primary key paired with a numeric `value`.
 */
export class OrionOffchainCursor {
/**
 * Creates a cursor, copying any provided properties onto the new instance.
 *
 * @param props Optional subset of fields to initialise the instance with.
 */
constructor(props?: Partial<OrionOffchainCursor>) {
Object.assign(this, props)
}

/**
 * Name of the offchain cursor; this is the primary column of the entity.
 */
@PrimaryColumn()
cursorName!: string

/**
 * Value of the cursor, stored in a non-nullable `int8` column.
 *
 * NOTE(review): declared as `number`, but values read back through raw queries
 * may arrive as strings depending on the database driver — confirm before
 * doing arithmetic on it without coercion.
 */
@Column('int8', { nullable: false })
value!: number
}
1 change: 1 addition & 0 deletions src/model/index.ts
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
export * from './generated'
export { NextEntityId } from './NextEntityId'
export { OrionOffchainCursor } from './OrionOffchainCursor'
64 changes: 64 additions & 0 deletions src/utils/OrionVideoLanguageManager.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import { EntityManager } from 'typeorm'
import {
detectVideoLanguageWithProvider,
updateVideoLanguages,
VIDEO_ORION_LANGUAGE_CURSOR_NAME,
} from './customMigrations/setOrionLanguageProvider'
import { globalEm } from './globalEm'

/**
 * Periodically refreshes the `orion_language` of videos.
 *
 * Each loop iteration first processes videos explicitly queued through
 * `scheduleVideoForDetection`, then runs `detectVideoLanguageWithProvider`
 * to continue the cursor-based detection pass.
 */
export class OrionVideoLanguageManager {
  /** IDs of videos queued for language detection on the next loop iteration. */
  private videoToDetect: Set<string> = new Set()

  /**
   * Starts the background update loop without waiting for it to finish.
   *
   * If the loop promise ever rejects, the error is printed and the process
   * exits with code -1.
   *
   * @param intervalMs Pause between two loop iterations, in milliseconds.
   */
  async init(intervalMs: number): Promise<void> {
    // NOTE(review): VIDEO_ORION_LANGUAGE_CURSOR_NAME is a non-empty string
    // constant, so this guard never triggers. It may have been meant to check
    // the provider configuration instead — confirm the intent.
    if (!VIDEO_ORION_LANGUAGE_CURSOR_NAME) {
      return
    }

    this.updateLoop(intervalMs).catch((err) => {
      console.error(err)
      process.exit(-1)
    })
  }

  /**
   * Queues a video for language detection; empty/nullish IDs are ignored.
   *
   * @param id ID of the video whose language should be (re-)detected.
   */
  scheduleVideoForDetection(id: string | null | undefined) {
    if (id) {
      this.videoToDetect.add(id)
    }
  }

  /**
   * Runs language detection for all currently queued videos.
   *
   * Only the IDs present when the call starts are removed from the queue
   * afterwards, so IDs scheduled while the database or the provider is being
   * awaited stay queued for the next run. If the lookup or the update throws,
   * the queue is left untouched so the same IDs are retried later.
   *
   * @param em Entity manager used to read the videos and write the results.
   */
  async updateScheduledVideoLanguage(em: EntityManager) {
    const queuedIds = [...this.videoToDetect]
    if (!queuedIds.length) {
      return
    }

    // Bind the IDs as parameters instead of splicing them into the SQL text.
    const placeholders = queuedIds.map((_, idx) => `$${idx + 1}`).join(',')
    const videos = await em.query(
      `
      SELECT id, title, description
      FROM admin.video
      WHERE id IN (${placeholders})
      `,
      queuedIds
    )

    // Queued IDs may no longer match any row; skip the batch update in that
    // case rather than asking it to build an UPDATE with an empty VALUES list.
    if (videos.length) {
      await updateVideoLanguages(em, videos)
    }

    for (const id of queuedIds) {
      this.videoToDetect.delete(id)
    }
  }

  /** Runs one cursor-based detection pass via `detectVideoLanguageWithProvider`. */
  async updateOrionVideoLanguage() {
    return detectVideoLanguageWithProvider()
  }

  /**
   * Endless loop: process queued videos, run the cursor-based pass, then sleep.
   * Failures of either step are logged and do not stop the loop.
   *
   * @param intervalMs Pause between two iterations, in milliseconds.
   */
  private async updateLoop(intervalMs: number): Promise<void> {
    const em = await globalEm
    while (true) {
      await this.updateScheduledVideoLanguage(em).catch((e) => {
        console.log(`Updating scheduled videos Orion language with provider failed`, e)
      })
      await this.updateOrionVideoLanguage().catch((e) => {
        console.log(`Updating Orion language with provider failed`, e)
      })
      await new Promise((resolve) => setTimeout(resolve, intervalMs))
    }
  }
}
56 changes: 0 additions & 56 deletions src/utils/customMigrations/setOrionLanguage.ts

This file was deleted.

75 changes: 75 additions & 0 deletions src/utils/customMigrations/setOrionLanguageProvider.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import { EntityManager } from 'typeorm'
import { OrionOffchainCursor } from '../../model'
import { globalEm } from '../globalEm'
import { predictLanguageWithProvider } from '../language'

// Maximum number of videos fetched (and updated) per iteration of the detection loop.
const batchSize = 5_000 // Adjust the batch size based on your database and network performance

// Shape of the rows selected from admin.video for language detection.
// NOTE(review): title and description are declared as non-null strings, but the rows
// come from raw SQL — confirm these columns can never be NULL.
type VideoUpdateType = {
id: string
title: string
description: string
}

// `cursor_name` under which the progress of the video language detection is stored.
export const VIDEO_ORION_LANGUAGE_CURSOR_NAME = 'video_orion_language'

/**
 * Detects the language of every given video from its title and description
 * and stores the results in `admin.video.orion_language` with a single
 * batched UPDATE.
 *
 * @param em Entity manager used to execute the UPDATE.
 * @param videos Videos to process; an empty array is a no-op.
 */
export async function updateVideoLanguages(
  em: EntityManager,
  videos: VideoUpdateType[]
): Promise<void> {
  // Nothing to detect, and an empty VALUES list would be a SQL syntax error.
  if (!videos.length) {
    return
  }

  // Rows originate from raw SQL, so a NULL column shows up as `null` at
  // runtime despite the declared type; without the fallback the literal text
  // "null" would be sent to the detector.
  const texts = videos.map(({ title, description }) => `${title ?? ''} ${description ?? ''}`)

  const predictions = await predictLanguageWithProvider(texts)

  // One "($n, $n+1)" tuple per video: (orion_language, id).
  const valueTuples = videos.map((_, idx) => `($${idx * 2 + 1}, $${idx * 2 + 2})`).join(',')

  const query = `
    UPDATE admin.video AS v SET
      orion_language = c.orion_language
    FROM (VALUES ${valueTuples}) AS c(orion_language, id)
    WHERE c.id = v.id;
  `

  // A missing prediction is written as an explicit NULL.
  const queryParams = videos.flatMap((video, idx) => [predictions[idx] ?? null, video.id])

  // Execute batch update
  await em.query(query, queryParams)
}

/**
 * Runs language detection over `admin.video` in batches of `batchSize`,
 * resuming from the offset stored in `orion_offchain_cursor` under
 * `VIDEO_ORION_LANGUAGE_CURSOR_NAME` and persisting the new offset after
 * every processed batch. Returns once a batch comes back empty.
 */
export async function detectVideoLanguageWithProvider() {
  const em: EntityManager = await globalEm
  const storedCursor: { value: number | string }[] = await em.query(
    `SELECT value FROM orion_offchain_cursor WHERE cursor_name = $1`,
    [VIDEO_ORION_LANGUAGE_CURSOR_NAME]
  )

  // The raw query may hand the bigint back as a string (presumably driver
  // dependent), hence the explicit numeric coercion; no stored row means
  // starting from the beginning.
  let cursor = Number(storedCursor[0]?.value ?? 0)

  while (true) {
    const videos: VideoUpdateType[] = await em.query(
      `
      SELECT id, title, description
      FROM admin.video
      ORDER BY id::INTEGER ASC
      OFFSET $1
      LIMIT $2
      `,
      [cursor, batchSize]
    )

    if (!videos.length) {
      console.log('No more videos!')
      break
    }

    await updateVideoLanguages(em, videos)

    // LIMIT already caps the row count at batchSize, so the cursor simply
    // advances by the number of rows received.
    const nextCursor = cursor + videos.length
    await em.save(
      new OrionOffchainCursor({
        cursorName: VIDEO_ORION_LANGUAGE_CURSOR_NAME,
        value: nextCursor,
      })
    )
    console.log(`Updated languages for videos in range ${cursor}-${nextCursor}`)
    cursor = nextCursor
  }
}
Loading
Loading