feat(transcription): groundwork

chore: fiddling around some more

chore: add ctranslate2 and timestamped

chore: add performance markers

chore: refactor test

chore: change workflow name

chore: ensure Python3

chore(duration): convert to chai/mocha syntax

chore(transcription): add individual tests for other transcribers

chore(transcription): implement formats test of all implementations

Also compare the results of other implementations to the reference implementation

chore(transcription): add more test cases with other languages, model sizes and a local model

chore(test): wip ctranslate2 adapter

chore(transcription): wip transcript file and benchmark

chore(test): clean a bit

chore(test): clean a bit

chore(test): refactor timestamped spec

chore(test): update workflow

chore(test): fix glob expansion with sh

chore(test): extract some hw info

chore(test): fix async tests

chore(benchmark): add model info

feat(transcription): allow use of a local model in timestamped-whisper

feat(transcription): extract run and profiling info into its own value object

feat(transcription): extract run concept into its own class and run more benchmarks

chore(transcription): simplify run object, only a uuid is now needed, and add more benchmark scenarios

docs(transcription): create own package README

docs(transcription): add local model usage

docs(transcription): update README

fix(transcription): use fr video for better comparison

chore(transcription): make openai comparison pass

docs(timestamped): clean up

chore(transcription): change transcribers' transcribe method signature

Introduce whisper builtin model.
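A sketch of the new object-style call, based on the `TranscribeArgs` interface and the `WhisperBuiltinModel` usage appearing later in this diff:

```typescript
const transcriptFile = await transcriber.transcribe({
  mediaFilePath: './video.mp4',
  model: new WhisperBuiltinModel('tiny'), // the builtin model introduced by this commit
  language: 'en',
  format: 'vtt'
})
```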

fix(transcription): activate language detection

Forbid transcript creation without a language.
Add `languageDetection` flag to an engine and some assertions.

Fix an issue in `whisper-ctranslate2`:
https://github.com/Softcatala/whisper-ctranslate2/pull/93
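
The guard amounts to the following check (see `assertLanguageDetectionAvailable` in `abstract-transcriber.ts` further down in this diff):

```typescript
if (!this.engine.languageDetection && !language) {
  throw new Error(`Language detection isn't available in ${this.engine.name}. A language must be provided explicitly.`)
}
```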

chore(transcription): use PeerTube time helpers instead of custom ones

Update the existing time function to output an integer number of seconds, and add a human-readable formatter for milliseconds, with a few accompanying tests.
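
Expected behaviour, as pinned down by the new specs in this diff:

```typescript
import { secondsToTime, millisecondsToTime } from '@peertube/peertube-core-utils'

secondsToTime(61.51)       // '1m2s': seconds are rounded to the nearest integer
millisecondsToTime(60_501) // '1m1s'
millisecondsToTime(499)    // '': durations under half a second format as an empty string
```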

chore(transcription): use PeerTube UUID helpers

chore(transcription): enable CER evaluation

Thanks to this recent fix in Jiwer <3
https://github.com/jitsi/jiwer/issues/873
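
Under the hood the wrapper shells out to the JiWER CLI with the `--cer` flag, roughly:

```sh
jiwer --reference reference.txt --hypothesis hypothesis.txt --cer -g
```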

chore(jiwer): create JiWer package

I'm not very happy with the TranscriptFileEvaluator constructor... suggestions?

chore(JiWer): add usage in README

docs(jiwer): update JiWer readme

chore(transcription): use FunMOOC video in fixtures

chore(transcription): add proper english video fixture

chore(transcription): use OS tmp directory where relevant

chore(transcription): fix jiwer cli test reference.txt

chore(transcription): move benchmark out of tests

chore(transcription): remove transcription workflow

docs(transcription): add benchmark info

fix(transcription): use ms precision in other transcribers

chore(transcription): simplify most of the tests

chore(transcription): remove slashes when building path with join

chore(transcription): make fromPath method async

chore(transcription): assert path to model is a directory for CTranslate2 transcriber

chore(transcription): ctranslate2 assertion

chore(transcription): ctranslate2 assertion

chore(transcription): add preinstall script for Python dependencies

chore(transcription): add download and unzip utils functions

chore(transcription): add download and unzip utils functions

chore(transcription): download & unzip models fixtures

chore(transcription): zip

chore(transcription): raise download file test timeout

chore(transcription): simplify download file test

chore(transcription): add transcriptions test to CI

chore(transcription): raise test preconditions timeout

chore(transcription): run preinstall scripts before running ci

chore(transcription): create dedicated tmp folder for transcriber tests

chore(transcription): raise timeout some more

chore(transcription): raise timeout some more

chore(transcription): raise timeout some more

chore(transcription): raise timeout some more

chore(transcription): raise timeout some more

chore(transcription): raise timeout some more

chore(transcription): raise timeout some more

chore(transcription): raise timeout some more

chore(transcription): use short video for local model test

chore(transcription): raise timeout some more

chore(transcription): raise timeout some more

chore(transcription): raise timeout some more

chore(transcription): set up verbosity based on NODE_ENV value
lutangar, 2024-03-29 10:34:45 +01:00 (committed by Chocobozzz)
parent b10482e0e0, commit ef14cf4a5c
69 changed files with 2159 additions and 7 deletions


@ -39,7 +39,7 @@ jobs:
strategy:
fail-fast: false
matrix:
test_suite: [ types-package, client, api-1, api-2, api-3, api-4, api-5, cli-plugin, lint, external-plugins ]
test_suite: [ types-package, client, api-1, api-2, api-3, api-4, api-5, transcription, cli-plugin, lint, external-plugins ]
env:
PGUSER: peertube

.gitignore

@ -2,6 +2,7 @@
node_modules
*npm-debug.log
yarn-error.log
*-ci.log
.yarn
# Testing


@ -11,6 +11,7 @@
{ "path": "../../packages/ffmpeg" },
{ "path": "../../packages/models" },
{ "path": "../../packages/node-utils" },
{ "path": "../../packages/server-commands" }
{ "path": "../../packages/server-commands" },
{ "path": "../../packages/transcription" },
]
}


@ -25,6 +25,7 @@
],
"scripts": {
"benchmark-server": "tsx --conditions=peertube:tsx ./scripts/benchmark.ts",
"benchmark-transcription": "tsx --conditions=peertube:tsx --tsconfig ./packages/transcription/tsconfig.json ./packages/transcription/src/benchmark.ts",
"build:client": "bash ./scripts/build/client.sh",
"build:embed": "bash ./scripts/build/embed.sh",
"build:peertube-cli": "bash ./scripts/build/peertube-cli.sh",


@ -125,7 +125,7 @@ function secondsToTime (options: {
else if (minutes >= 1) time += formatNumber(minutes) + minuteSymbol
else if (format === 'full') time += '00' + minuteSymbol
seconds %= 60
seconds = Math.round(seconds) % 60
if (seconds >= 1 && seconds < 10 && format === 'full') time += '0' + seconds + secondsSymbol
else if (seconds >= 1) time += formatNumber(seconds) + secondsSymbol
else if (format === 'full') time += '00'
@ -133,6 +133,14 @@ function secondsToTime (options: {
return time
}
function millisecondsToTime (options: {
seconds: number
format: 'short' | 'full' | 'locale-string' // default 'short'
symbol?: string
} | number) {
return secondsToTime(typeof options === 'number' ? options / 1000 : { ...options, seconds: options.seconds / 1000 })
}
// ---------------------------------------------------------------------------
export {
@ -143,7 +151,8 @@ export {
isLastMonth,
isLastWeek,
timeToInt,
secondsToTime
secondsToTime,
millisecondsToTime
}
// ---------------------------------------------------------------------------

packages/jiwer/README.md

@ -0,0 +1,37 @@
JiWER
=====
Node.js wrapper around the __JiWER__ CLI.
> *JiWER is a python tool for computing the word-error-rate of ASR systems.*
> https://jitsi.github.io/jiwer/cli/
__JiWER__ serves as a reference implementation to calculate error rates between two text files:
- WER (Word Error Rate)
- CER (Character Error Rate)
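For intuition, WER = (substitutions + deletions + insertions) / number of reference words, and CER is the same ratio computed over characters. For example, comparing the reference `the cat sat on the mat` with the hypothesis `the cat sit on mat` (one substitution, one deletion, six reference words) gives WER = 2/6 ≈ 0.33.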
Build
-----
```sh
npm run build
```
Usage
-----
```typescript
const jiwerCLI = new JiwerClI('./reference.txt', './hypothesis.txt')
// WER as a rate, e.g. 0.03 -> 3%
console.log(await jiwerCLI.wer())
// CER as a rate, e.g. 0.01 -> 1%
console.log(await jiwerCLI.cer())
// Detailed comparison report
console.log(await jiwerCLI.alignment())
```
Resources
---------
- https://jitsi.github.io/jiwer/
- https://github.com/rapidfuzz/RapidFuzz


@ -0,0 +1,20 @@
{
"name": "@peertube/peertube-jiwer",
"private": true,
"version": "0.0.0",
"main": "dist/index.js",
"files": [ "dist" ],
"exports": {
"types": "./dist/index.d.ts",
"peertube:tsx": "./src/index.ts",
"default": "./dist/index.js"
},
"type": "module",
"devDependencies": {},
"scripts": {
"preinstall": "pip install -r requirements.txt",
"build": "tsc",
"watch": "tsc -w"
},
"dependencies": {}
}


@ -0,0 +1 @@
jiwer==3.0.4


@ -0,0 +1 @@
export * from './jiwer-cli.js'


@ -0,0 +1,69 @@
import { $ } from 'execa'
export class JiwerClI {
referenceFilePath: string
hypothesisFilePath: string
constructor (referenceFilePath: string, hypothesisFilePath: string) {
this.referenceFilePath = referenceFilePath
this.hypothesisFilePath = hypothesisFilePath
}
/**
* @param referenceFilePath Path to new-line delimited text file of reference sentences.
* @param hypothesisFilePath Path to new-line delimited text file of hypothesis sentences.
* @param args
*/
static buildArgs (referenceFilePath: string, hypothesisFilePath: string, ...args: string[]) {
return [
'--reference',
referenceFilePath,
'--hypothesis',
hypothesisFilePath,
...args
]
}
buildArgs (...args: string[]) {
return JiwerClI.buildArgs(this.referenceFilePath, this.hypothesisFilePath, ...args)
}
/**
* WER: Word Error Rate, e.g. 0.03 -> 3%
*/
static async wer (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<number> {
const { stdout: wer } = await $`jiwer ${JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, ...(global ? [ '-g' ] : []))}`
return Number(wer)
}
async wer (global = true) {
return await JiwerClI.wer(this.referenceFilePath, this.hypothesisFilePath, global)
}
/**
* CER: Character Error Rate
*/
static async cer (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<number> {
const { stdout: cer } = await $`jiwer ${JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, '--cer', ...(global ? [ '-g' ] : []))}`
return Number(cer)
}
async cer (global = true) {
return await JiwerClI.cer(this.referenceFilePath, this.hypothesisFilePath, global)
}
/**
* Print alignment of each sentence.
*/
static async alignment (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<string> {
const { stdout: alignment } = await $`jiwer ${JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, '--align', ...(global ? [ '-g' ] : []))}`
return alignment
}
async alignment (global = true) {
return await JiwerClI.alignment(this.referenceFilePath, this.hypothesisFilePath, global)
}
}


@ -0,0 +1,8 @@
{
"extends": "../../tsconfig.base.json",
"compilerOptions": {
"outDir": "./dist",
"rootDir": "src",
"tsBuildInfoFile": "./dist/.tsbuildinfo"
}
}


@ -1,4 +1,4 @@
import short from 'short-uuid'
import short, { SUUID } from 'short-uuid'
const translator = short()
@ -6,6 +6,10 @@ function buildUUID () {
return short.uuid()
}
function buildSUUID (): SUUID {
return short.generate()
}
function uuidToShort (uuid: string) {
if (!uuid) return uuid
@ -26,7 +30,10 @@ function isShortUUID (value: string) {
export {
buildUUID,
buildSUUID,
uuidToShort,
shortToUUID,
isShortUUID
}
export type { SUUID }


@ -59,6 +59,15 @@ export function makeRawRequest (options: {
return makeGetRequest(reqOptions)
}
export const makeFileRequest = (url: string) => {
return makeRawRequest({
url,
responseType: 'arraybuffer',
redirects: 1,
expectedStatus: HttpStatusCode.OK_200
})
}
export function makeGetRequest (options: CommonRequestParams & {
query?: any
rawQuery?: string

Binary file not shown.


@ -0,0 +1,16 @@
🇫🇷 DRANE Occitanie - Communiquer lors d'une classe transplantée
[./communiquer-lors-dune-classe-transplantee.mp4](communiquer-lors-dune-classe-transplantee.mp4)
> https://podeduc.apps.education.fr/numerique-educatif/video/21893-communiquer-lors-dune-classe-transplantee/
>
> CC BY-NC-SA 4.0 Deed
> Attribution-NonCommercial-ShareAlike 4.0 International
🇫🇷 [Accompagner la victime d'une dérive sectaire ou d'une emprise mentale](https://www.fun-mooc.fr/fr/cours/accompagner-la-victime-de-derive-sectaire/)
> Centre Contre les Manipulations Mentales (CCMM)
> [CC BY-NC-ND 4.0 Deed](https://creativecommons.org/licenses/by-nc-nd/4.0/)
> Attribution-NonCommercial-NoDerivs 4.0 International
🇺🇸 [The Last Man On Earth (1964)](https://archive.org/details/TheLastManOnEarthHD)
> PDM 1.0 Deed
> Public Domain Mark 1.0 Universal
> https://creativecommons.org/publicdomain/mark/1.0/


@ -0,0 +1,10 @@
Communiquer lors d'une classe transplantée. Utiliser les photos prises lors de cette classe pour raconter quotidiennement le séjour vécu.
C'est le scénario pédagogique présenté par Monsieur Navoli, professeur en cycle 3 sur une école élémentaire de Montpellier.
La première application utilisée sera la médiathèque. L'enseignant va alors transférer les différentes photos réalisées lors de la classe transplantée.
Dans un dossier spécifique pour que les élèves puissent le retrouver plus facilement. Il téléverse donc ses photos dans le dossier, dans l'ENT, dans la médiathèque de la classe.
Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
Les élèves par la suite utiliseront le blog, à partir de leurs notes, il pourront, seul ou à 2 par poste rédiger un article dans leur ENT.
Ils illustreront ces articles à l'aide des photos et documents numériques mis en accès libre dans l'ENT.
Pour ce faire, il pourront utiliser l'éditeur avancé qui les renverra directement dans la médiathèque de la classe, où ils pourront retrouver le dossier créé par leur enseignant.
Une fois leur article terminé, les élèves soumettront celui-ci au professeur qui pourra soit l'annoter pour correction ou le publier.
Ensuite, il pourront lire et commenter ceux de leurs camarades, ou répondre aux commentaires de la veille.

Binary file not shown.


@ -0,0 +1,165 @@

1
00:00:03,640 --> 00:00:05,640
-Bonjour et bienvenue sur FUN MOOC.
2
00:00:05,960 --> 00:00:09,000
Notre MOOC "Comment parler
à une victime d'emprise mentale
3
00:00:09,320 --> 00:00:10,400
ou de dérive sectaire"
4
00:00:10,720 --> 00:00:13,840
s'adresse à tout professionnel
du domaine de la santé,
5
00:00:14,160 --> 00:00:15,920
de l'associatif, du juridique,
6
00:00:16,240 --> 00:00:18,800
qui pourra être en contact
avec une victime de telles dérives.
7
00:00:21,720 --> 00:00:23,840
Il sera composé de 14 leçons vidéo
8
00:00:24,160 --> 00:00:26,040
d'une dizaine de minutes
9
00:00:26,360 --> 00:00:28,600
divisées en quatre blocs.
10
00:00:31,800 --> 00:00:34,960
Le premier bloc vous informera
de ce que sont exactement
11
00:00:35,280 --> 00:00:37,720
l'emprise mentale
et une dérive sectaire.
12
00:00:38,040 --> 00:00:42,440
-Ça consiste toujours
en une forme de manipulation
13
00:00:43,520 --> 00:00:47,320
qui conduit à une dépendance,
à une sorte de cercle vicieux,
14
00:00:47,640 --> 00:00:51,200
où les personnes ne parviennent pas
à se désengager d'un processus
15
00:00:51,520 --> 00:00:54,120
qui les conduit
soit à donner de l'argent,
16
00:00:54,440 --> 00:00:56,160
à se livrer à des actes
17
00:00:56,480 --> 00:00:58,480
qu'en réalité
ils n'auraient pas acceptés,
18
00:00:58,800 --> 00:01:02,160
ou, tout simplement, à accepter
de participer à une organisation
19
00:01:02,480 --> 00:01:03,760
dont ils ne partagent pas
20
00:01:04,080 --> 00:01:06,040
toutes les méthodes
ou tous les points de vue.
21
00:01:06,360 --> 00:01:10,080
-Le deuxième bloc vous informera
des bonnes techniques d'écoute
22
00:01:10,400 --> 00:01:12,680
d'une personne
ayant vécu de tels traumatismes.
23
00:01:13,000 --> 00:01:14,760
-C'est un sujet actuel
24
00:01:15,080 --> 00:01:17,320
parce que ce phénomène
est en croissance.
25
00:01:17,640 --> 00:01:20,000
Il y a une augmentation très importante,
un doublement,
26
00:01:20,320 --> 00:01:21,400
en l'espace de quelques années,
27
00:01:21,720 --> 00:01:22,960
en moins de 10 ans.
28
00:01:27,200 --> 00:01:31,000
-Le bloc 3, lui,
sera conçu par nos juristes
29
00:01:31,320 --> 00:01:34,080
pour vous indiquer
quelles sont les grandes infractions
30
00:01:34,400 --> 00:01:36,960
en lien avec l'emprise mentale,
31
00:01:37,280 --> 00:01:39,120
et surtout, pouvoir faire
une analyse perspicace
32
00:01:39,440 --> 00:01:41,640
d'une situation individuelle.
33
00:01:43,760 --> 00:01:46,960
Enfin, le bloc 4 vous assistera
34
00:01:47,280 --> 00:01:50,320
pour savoir comment aiguiller
une victime
35
00:01:50,640 --> 00:01:52,400
vers les bons professionnels.
36
00:01:53,160 --> 00:01:54,040
Bonne formation.


@ -0,0 +1,11 @@
-Bonjour et bienvenue sur FUN MOOC.
Notre MOOC "Comment parler à une victime d'emprise mentale ou de dérive sectaire" s'adresse à tout professionnel du domaine de la santé, de l'associatif, du juridique, qui pourra être en contact avec une victime de telles dérives.
Il sera composé de 14 leçons vidéo d'une dizaine de minutes divisées en quatre blocs.
Le premier bloc vous informera de ce que sont exactement l'emprise mentale et une dérive sectaire.
-Ça consiste toujours en une forme de manipulation qui conduit à une dépendance, à une sorte de cercle vicieux, où les personnes ne parviennent pas à se désengager d'un processus qui les conduit soit à donner de l'argent, à se livrer à des actes qu'en réalité ils n'auraient pas acceptés, ou, tout simplement, à accepter de participer à une organisation dont ils ne partagent pas toutes les méthodes ou tous les points de vue.
-Le deuxième bloc vous informera des bonnes techniques d'écoute d'une personne ayant vécu de tels traumatismes.
-C'est un sujet actuel parce que ce phénomène est en croissance.
Il y a une augmentation très importante, un doublement, en l'espace de quelques années, en moins de 10 ans.
-Le bloc 3, lui, sera conçu par nos juristes pour vous indiquer quelles sont les grandes infractions en lien avec l'emprise mentale, et surtout, pouvoir faire une analyse perspicace d'une situation individuelle.
Enfin, le bloc 4 vous assistera pour savoir comment aiguiller une victime vers les bons professionnels.
Bonne formation.


@ -0,0 +1,17 @@
1
00:00:00,000 --> 00:00:01,940
December, 1965.
2
00:00:03,460 --> 00:00:06,660
Is that all it has been since
I inherited the world?
3
00:00:07,020 --> 00:00:08,900
Only three years.
4
00:00:09,940 --> 00:00:11,760
Seems like a hundred million.


@ -0,0 +1,5 @@
December, 1965.
Is that all it has been since
I inherited the world?
Only three years.
It seems like a hundred million.


@ -0,0 +1,14 @@
WEBVTT
00:00.000 --> 00:01.940
December, 1965.
00:03.460 --> 00:06.660
Is that all it has been since I inherited the world?
00:07.020 --> 00:08.900
Only three years.
00:09.940 --> 00:11.760
Seems like a hundred million.


@ -0,0 +1,29 @@
import { millisecondsToTime, secondsToTime } from '@peertube/peertube-core-utils'
import { expect } from 'chai'
describe('Seconds to time', function () {
it('Outputs a human readable time', function () {
expect(secondsToTime(61.1335)).to.equals('1m1s')
})
it('Rounds the number of seconds to the nearest integer', function () {
expect(secondsToTime(61.4)).to.equals('1m1s')
expect(secondsToTime(61.6)).to.equals('1m2s')
expect(secondsToTime(61.51)).to.equals('1m2s')
})
})
describe('Milliseconds to time', function () {
it('Outputs a human readable time', function () {
expect(millisecondsToTime(60_000)).to.equals('1m')
})
it('Rounds the number of seconds to the nearest integer', function () {
expect(millisecondsToTime(60_100)).to.equals('1m')
expect(millisecondsToTime(60_501)).to.equals('1m1s')
})
it('Times below 500ms format as an empty string', function () {
expect(millisecondsToTime(499)).to.equals('')
})
})


@ -0,0 +1,48 @@
/* eslint-disable max-len */
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
import { join } from 'path'
import { mkdir, rm, writeFile } from 'node:fs/promises'
import { expect } from 'chai'
import { JiwerClI } from '@peertube/peertube-jiwer'
describe('Jiwer CLI', function () {
const transcriptDirectory = buildAbsoluteFixturePath('transcription/transcript-evaluator')
const referenceTranscriptFilePath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.txt')
const hypothesis = join(transcriptDirectory, 'openai.txt')
const jiwerCLI = new JiwerClI(referenceTranscriptFilePath, hypothesis)
before(async function () {
await mkdir(transcriptDirectory, { recursive: true })
await writeFile(join(transcriptDirectory, 'openai.txt'), `Communiquez lors d'une classe transplante. Utilisez les photos prises lors de cette classe pour raconter quotidiennement le séjour vécu.
C'est le scénario P-Dagujic présenté par monsieur Navoli, professeur ainsi que le 3 sur une école alimentaire de Montpellier.
La première application a utilisé ce ralame déatec. L'enseignant va alors transférer les différentes photos réalisés lors de la classe transplante.
Dans un dossier, spécifique pour que les élèves puissent le retrouver plus facilement. Il téléverse donc ses photos dans le dossier, dans le venté, dans la médiatèque de la classe.
Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
Les élèves par la suite utilisera le blog. A partir de leurs nantes, il pourront se loi de parposte rédigeant un article d'un reinté.
Ils illustront ses articles à l'aide des photos de que mon numérique mise à n'accélier dans le venté.
Pour se faire, il pourront utiliser les diteurs avancés qui les renvèrent directement dans la médiatèque de la classe il pourront retrouver le dossier créé par leurs enseignants.
Une fois leur article terminée, les élèves soumétront se lui-ci au professeur qui pourra soit la noté pour correction ou le public.
Ensuite, il pourront lire et commenter ce de leurs camarades ou répondre aux commentaires de la veille.
`)
})
it(`returns coherent wer`, async function () {
const wer = await jiwerCLI.wer()
expect(wer).to.be.below(30 / 100)
expect(wer).to.be.greaterThan(0 / 100)
})
it(`returns coherent cer`, async function () {
const cer = await jiwerCLI.cer()
expect(cer).to.be.below(10 / 100)
expect(cer).to.be.greaterThan(9 / 100)
})
it(`prints alignment`, async function () {
console.log(await jiwerCLI.alignment())
})
after(async function () {
await rm(transcriptDirectory, { recursive: true, force: true })
})
})


@ -29,5 +29,7 @@ export const FIXTURE_URLS = {
chatersVideo: 'https://download.cpy.re/peertube/video_chapters.mp4',
file4K: 'https://download.cpy.re/peertube/4k_file.txt'
file4K: 'https://download.cpy.re/peertube/4k_file.txt',
transcriptionModels: 'https://download.cpy.re/peertube/transcription-models.zip'
}


@ -0,0 +1,18 @@
import { expect } from 'chai'
import { levenshteinDistance } from '@peertube/peertube-transcription'
describe('Levenshtein distance', function () {
it(`equals 1 when there is only one character difference`, function () {
expect(levenshteinDistance('abcd', 'abce')).equals(1)
})
it(`may calculate a distance on txt subtitle content`, function () {
expect(levenshteinDistance(`December, 1965.
Is that all it has been since
I inherited the world?
Only three years.
Seems like a hundred million.
`, 'December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.')).equals(13)
})
})


@ -0,0 +1,33 @@
import { srtToTxt } from '@peertube/peertube-transcription'
import { expect } from 'chai'
describe('srt to txt', function () {
it(`Transforms the content of a srt subtitle to a pure text version`, function () {
const txt = srtToTxt(`1
00:00:00,000 --> 00:00:01,940
December, 1965.
2
00:00:03,460 --> 00:00:06,660
Is that all it has been since
I inherited the world?
3
00:00:07,020 --> 00:00:08,900
Only three years.
4
00:00:09,940 --> 00:00:11,760
Seems like a hundred million.
`)
expect(txt).equals(`December, 1965.
Is that all it has been since
I inherited the world?
Only three years.
Seems like a hundred million.
`)
})
})


@ -0,0 +1,17 @@
import { transcriberFactory } from '@peertube/peertube-transcription'
describe('Transcriber factory', function () {
const transcribers = [
'openai-whisper',
'whisper-ctranslate2',
'whisper-timestamped'
]
describe('Should be able to create a transcriber for each available transcription engine', function () {
transcribers.forEach(function (transcriberName) {
it(`Should be able to create a(n) ${transcriberName} transcriber`, function () {
transcriberFactory.createFromEngineName(transcriberName)
})
})
})
})


@ -0,0 +1,67 @@
/* eslint-disable @typescript-eslint/no-unused-expressions, no-new, max-len */
import { TranscriptFile, TranscriptFileEvaluator } from '@peertube/peertube-transcription'
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
import { join } from 'node:path'
import { mkdir, rm } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { expect } from 'chai'
describe('Transcript File Evaluator', function () {
const transcriptDirectory = join(tmpdir(), 'peertube-transcription', 'transcript-file-evaluator')
const referenceTranscriptFilePath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.txt')
before(async function () {
await mkdir(transcriptDirectory, { recursive: true })
})
it(`may not compare files in a format other than txt`, async function () {
const vttReference = await TranscriptFile.write({
path: join(transcriptDirectory, 'reference.vtt'),
format: 'vtt',
content: ''
})
const vttHypothesis = await TranscriptFile.write({
path: join(transcriptDirectory, 'hypothesis.vtt'),
format: 'vtt',
content: ''
})
expect(() => new TranscriptFileEvaluator(vttReference, vttHypothesis)).to.throw('Can only evaluate txt transcript file')
})
it(`evaluation must return coherent wer & cer`, async function () {
const reference = new TranscriptFile({
path: referenceTranscriptFilePath,
language: 'fr',
format: 'txt'
})
const hypothesis = await TranscriptFile.write({
path: join(transcriptDirectory, 'openai.txt'),
content: `Communiquez lors d'une classe transplante. Utilisez les photos prises lors de cette classe pour raconter quotidiennement le séjour vécu.
C'est le scénario P-Dagujic présenté par monsieur Navoli, professeur ainsi que le 3 sur une école alimentaire de Montpellier.
La première application a utilisé ce ralame déatec. L'enseignant va alors transférer les différentes photos réalisés lors de la classe transplante.
Dans un dossier, spécifique pour que les élèves puissent le retrouver plus facilement. Il téléverse donc ses photos dans le dossier, dans le venté, dans la médiatèque de la classe.
Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
Les élèves par la suite utilisera le blog. A partir de leurs nantes, il pourront se loi de parposte rédigeant un article d'un reinté.
Ils illustront ses articles à l'aide des photos de que mon numérique mise à n'accélier dans le venté.
Pour se faire, il pourront utiliser les diteurs avancés qui les renvèrent directement dans la médiatèque de la classe il pourront retrouver le dossier créé par leurs enseignants.
Une fois leur article terminée, les élèves soumétront se lui-ci au professeur qui pourra soit la noté pour correction ou le public.
Ensuite, il pourront lire et commenter ce de leurs camarades ou répondre aux commentaires de la veille.
`,
format: 'txt',
language: 'fr'
})
const evaluator = new TranscriptFileEvaluator(reference, hypothesis)
const wer = await evaluator.wer()
expect(wer).to.be.greaterThan(0 / 100)
expect(wer).to.be.below(30 / 100)
const cer = await evaluator.cer()
expect(cer).to.be.greaterThan(9 / 100)
expect(cer).to.be.below(10 / 100)
console.log(await evaluator.alignment())
})
after(async function () {
await rm(transcriptDirectory, { recursive: true, force: true })
})
})


@ -0,0 +1,44 @@
/* eslint-disable @typescript-eslint/no-unused-expressions */
import { expect } from 'chai'
import { join } from 'node:path'
import { mkdir, rm } from 'node:fs/promises'
import { TranscriptFile } from '@peertube/peertube-transcription'
import { tmpdir } from 'node:os'
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
describe('Transcript File', function () {
const transcriptFileDirectory = join(tmpdir(), 'peertube-transcription', 'transcript-file')
before(async function () {
await mkdir(transcriptFileDirectory, { recursive: true })
})
it(`may create a new transcript file from scratch`, async function () {
const transcript1 = await TranscriptFile.write({
path: join(transcriptFileDirectory, 'test1.txt'),
content: 'test2',
format: 'txt'
})
const transcript2 = await TranscriptFile.write({
path: join(transcriptFileDirectory, 'test2.txt'),
content: 'test2',
format: 'txt'
})
expect(await transcript1.equals(transcript2)).to.be.true
})
it(`may create a txt transcript file object from a transcript without providing the format explicitly`, function () {
TranscriptFile.fromPath(buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.srt'), 'en')
TranscriptFile.fromPath(buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.txt'), 'en')
})
it(`fails when loading a file which is obviously not a transcript`, function () {
expect(() => TranscriptFile.fromPath(buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4'), 'en'))
.to.throw(`Couldn't guess transcript format from extension "mp4". Valid formats are: txt, vtt, srt.`)
})
after(async function () {
await rm(transcriptFileDirectory, { recursive: true, force: true })
})
})


@ -0,0 +1 @@
describe('Transcription run', function () {})


@ -0,0 +1,44 @@
import { cp, lstat, mkdir, rm } from 'node:fs/promises'
import { join } from 'node:path'
import { tmpdir } from 'node:os'
import { expect } from 'chai'
import { downloadFile, unzip } from '@peertube/peertube-transcription'
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
describe('downloadFile', function () {
const testDirectory = join(tmpdir(), 'peertube-transcription', 'utils')
before(async function () {
await mkdir(testDirectory, { recursive: true })
})
it(`Downloads a file and writes it to disk`, async function () {
const filePath = await downloadFile('https://download.cpy.re/peertube/4k_file.txt', testDirectory)
expect(await lstat(filePath).then(stats => stats.isFile())).equals(true)
})
after(async function () {
await rm(testDirectory, { recursive: true, force: true })
})
})
describe('unzip', function () {
const zipFixtureFileName = 'hello_world.zip'
const zipFixtureFilePath = buildAbsoluteFixturePath(`transcription/${zipFixtureFileName}`)
const testDirectory = join(tmpdir(), 'peertube-transcription', 'utils')
before(async function () {
await mkdir(testDirectory, { recursive: true })
})
it(`Extracts a zip archive to a directory`, async function () {
const zipFilePath = join(testDirectory, zipFixtureFileName)
await cp(zipFixtureFilePath, zipFilePath)
const unzippedDirectory = await unzip(zipFilePath)
expect(await lstat(unzippedDirectory).then(stats => stats.isDirectory())).equals(true)
})
after(async function () {
await rm(testDirectory, { recursive: true, force: true })
})
})


@ -0,0 +1,125 @@
/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */
import { expect, config } from 'chai'
import { createLogger } from 'winston'
import { join } from 'node:path'
import { mkdir, rm } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
import {
downloadFile,
levenshteinDistance,
OpenaiTranscriber,
TranscriptFile,
TranscriptFileEvaluator,
TranscriptionModel,
unzip,
WhisperBuiltinModel
} from '@peertube/peertube-transcription'
import { FIXTURE_URLS } from '@tests/shared/fixture-urls.js'
config.truncateThreshold = 0
describe('Open AI Whisper transcriber', function () {
const tmpDirectory = join(tmpdir(), 'peertube-transcription')
const transcriptDirectory = join(tmpDirectory, 'transcriber', 'openai')
const modelsDirectory = join(tmpDirectory, 'models')
const shortVideoPath = buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4')
const frVideoPath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4')
const referenceTranscriptFile = new TranscriptFile({
path: buildAbsoluteFixturePath('transcription/videos/derive_sectaire.txt'),
language: 'fr',
format: 'txt'
})
const transcriber = new OpenaiTranscriber(
{
name: 'openai-whisper',
requirements: [],
type: 'binary',
binary: 'whisper',
supportedModelFormats: [ 'PyTorch' ],
languageDetection: true
},
createLogger(),
transcriptDirectory
)
before(async function () {
this.timeout(1 * 1000 * 60)
await mkdir(transcriptDirectory, { recursive: true })
await unzip(await downloadFile(FIXTURE_URLS.transcriptionModels, tmpDirectory))
})
it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () {
this.timeout(3 * 1000 * 60)
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' })
expect(transcript.format).to.equals('vtt')
expect(transcript.language).to.equals('en')
expect(await transcript.read()).not.to.be.empty
})
it('May produce a transcript file in the `srt` format', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' })
expect(transcript.format).to.equals('srt')
expect(transcript.language).to.equals('en')
expect(await transcript.read()).not.to.be.empty
})
it('May produce a transcript file in the `txt` format', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' })
expect(transcript.format).to.equals('txt')
expect(transcript.language).to.equals('en')
expect(await transcript.read()).not.to.be.empty
expect(levenshteinDistance(
(await transcript.read()).toString(),
'December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.'
)).to.be.below(3)
})
it('May transcribe a media file using a local PyTorch model', async function () {
this.timeout(2 * 1000 * 60)
await transcriber.transcribe({
mediaFilePath: shortVideoPath,
model: await TranscriptionModel.fromPath(join(modelsDirectory, 'tiny.pt')),
language: 'en'
})
})
it('May transcribe a media file in French', async function () {
this.timeout(3 * 1000 * 60)
const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath, language: 'fr', format: 'txt' })
expect(transcript.format).to.equals('txt')
expect(transcript.language).to.equals('fr')
expect(await transcript.read()).not.to.be.empty
})
it('Guesses the video language if not provided', async function () {
this.timeout(3 * 1000 * 60)
const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath })
expect(transcript.language).to.equals('fr')
})
it('May transcribe a media file in French with the small model', async function () {
this.timeout(6 * 1000 * 60)
const transcript = await transcriber.transcribe({
mediaFilePath: frVideoPath,
language: 'fr',
format: 'txt',
model: new WhisperBuiltinModel('small')
})
expect(transcript.language).to.equals('fr')
const transcriptFileEvaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcript)
const cer = await transcriptFileEvaluator.cer()
expect(cer).to.be.below(6 / 100)
})
after(async function () {
await rm(transcriptDirectory, { recursive: true, force: true })
})
})


@ -0,0 +1,133 @@
/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */
import { expect, config } from 'chai'
import { createLogger } from 'winston'
import { join } from 'node:path'
import { mkdir, rm } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
import {
OpenaiTranscriber,
WhisperTimestampedTranscriber,
TranscriptFileEvaluator,
TranscriptionModel,
WhisperTranscribeArgs,
levenshteinDistance, downloadFile, unzip
} from '@peertube/peertube-transcription'
import { FIXTURE_URLS } from '@tests/shared/fixture-urls.js'
config.truncateThreshold = 0
describe('Linto timestamped Whisper transcriber', function () {
const tmpDirectory = join(tmpdir(), 'peertube-transcription')
const transcriptDirectory = join(tmpDirectory, 'transcriber', 'timestamped')
const modelsDirectory = join(tmpDirectory, 'models')
const shortVideoPath = buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4')
const frVideoPath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4')
const transcriber = new WhisperTimestampedTranscriber(
{
name: 'whisper-timestamped',
requirements: [],
type: 'binary',
binary: 'whisper_timestamped',
supportedModelFormats: [ 'PyTorch' ],
languageDetection: true
},
createLogger(),
transcriptDirectory
)
before(async function () {
this.timeout(1 * 1000 * 60)
await mkdir(transcriptDirectory, { recursive: true })
await unzip(await downloadFile(FIXTURE_URLS.transcriptionModels, tmpDirectory))
})
it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () {
this.timeout(1 * 1000 * 60)
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' })
expect(transcript.format).to.equals('vtt')
expect(transcript.language).to.equals('en')
expect(await transcript.read()).not.to.be.empty
})
it('May produce a transcript file in the `srt` format with a ms precision', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' })
expect(transcript.format).to.equals('srt')
expect(transcript.language).to.equals('en')
expect(await transcript.read()).not.to.be.empty
})
it('May produce a transcript file in `txt` format', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' })
expect(transcript.format).to.equals('txt')
expect(transcript.language).to.equals('en')
expect(await transcript.read()).not.to.be.empty
expect(levenshteinDistance(
(await transcript.read()).toString(),
'December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.'
)).to.be.below(10)
})
it('May transcribe a media file using a local PyTorch model file', async function () {
this.timeout(2 * 1000 * 60)
await transcriber.transcribe({
mediaFilePath: shortVideoPath,
model: await TranscriptionModel.fromPath(join(modelsDirectory, 'tiny.pt')),
language: 'en'
})
})
it('May transcribe a media file in French', async function () {
this.timeout(2 * 1000 * 60)
const transcript = await transcriber.transcribe({
mediaFilePath: frVideoPath,
language: 'fr',
format: 'txt'
})
expect(transcript.format).to.equals('txt')
expect(transcript.language).to.equals('fr')
expect(await transcript.read()).not.to.be.empty
})
it('Guesses the video language if not provided', async function () {
this.timeout(2 * 1000 * 60)
const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath })
expect(transcript.language).to.equals('fr')
})
it('Should produce a text transcript similar to openai-whisper implementation', async function () {
this.timeout(11 * 1000 * 60)
const transcribeArgs: WhisperTranscribeArgs = {
mediaFilePath: frVideoPath,
model: await TranscriptionModel.fromPath(join(modelsDirectory, 'tiny.pt')),
language: 'fr',
format: 'txt'
}
const transcript = await transcriber.transcribe(transcribeArgs)
const openaiTranscriber = new OpenaiTranscriber(
{
name: 'openai-whisper',
requirements: [],
type: 'binary',
binary: 'whisper',
supportedModelFormats: [ 'PyTorch' ]
},
createLogger(),
join(transcriptDirectory, 'openai-whisper')
)
const openaiTranscript = await openaiTranscriber.transcribe(transcribeArgs)
const transcriptFileEvaluator = new TranscriptFileEvaluator(openaiTranscript, transcript)
expect(await transcriptFileEvaluator.wer()).to.be.below(25 / 100)
expect(await transcriptFileEvaluator.cer()).to.be.below(15 / 100)
})
after(async function () {
await rm(transcriptDirectory, { recursive: true, force: true })
})
})


@ -0,0 +1,137 @@
/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */
import { expect, config } from 'chai'
import { createLogger } from 'winston'
import { join } from 'node:path'
import { mkdir, rm } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
import {
Ctranslate2Transcriber, downloadFile,
levenshteinDistance,
OpenaiTranscriber,
TranscriptFile,
TranscriptFileEvaluator,
TranscriptionModel, unzip,
WhisperTranscribeArgs
} from '@peertube/peertube-transcription'
import { FIXTURE_URLS } from '@tests/shared/fixture-urls.js'
config.truncateThreshold = 0
describe('Whisper CTranslate2 transcriber', function () {
const tmpDirectory = join(tmpdir(), 'peertube-transcription')
const transcriptDirectory = join(tmpDirectory, 'transcriber', 'ctranslate2')
const modelsDirectory = join(tmpDirectory, 'models')
const shortVideoPath = buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4')
const frVideoPath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4')
const transcriber = new Ctranslate2Transcriber(
{
name: 'anyNameShouldBeFineReally',
requirements: [],
type: 'binary',
binary: 'whisper-ctranslate2',
supportedModelFormats: [],
languageDetection: true
},
createLogger(),
transcriptDirectory
)
before(async function () {
this.timeout(1 * 1000 * 60)
await mkdir(transcriptDirectory, { recursive: true })
await unzip(await downloadFile(FIXTURE_URLS.transcriptionModels, tmpDirectory))
})
it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' })
expect(transcript.format).to.equals('vtt')
expect(transcript.language).to.equals('en')
expect(await transcript.read()).not.to.be.empty
})
it('May produce a transcript file in the `srt` format', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' })
expect(transcript.format).to.equals('srt')
expect(transcript.language).to.equals('en')
expect(await transcript.read()).not.to.be.empty
})
it('May produce a transcript file in the `txt` format', async function () {
const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' })
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'the_last_man_on_earth.txt'),
format: 'txt',
language: 'en'
}))).to.be.true
expect(transcript.format).to.equals('txt')
expect(transcript.language).to.equals('en')
expect(await transcript.read()).not.to.be.empty
expect(levenshteinDistance(
(await transcript.read()).toString(),
'December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.'
)).to.be.below(5)
})
it('May transcribe a media file using a local CTranslate2 model', async function () {
this.timeout(2 * 1000 * 60)
const transcript = await transcriber.transcribe({
mediaFilePath: shortVideoPath,
model: await TranscriptionModel.fromPath(join(modelsDirectory, 'faster-whisper-tiny')),
language: 'en',
format: 'txt'
})
expect(transcript.format).to.equals('txt')
expect(transcript.language).to.equals('en')
expect(await transcript.read()).not.to.be.empty
})
it('May transcribe a media file in French', async function () {
this.timeout(5 * 1000 * 60)
const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath, language: 'fr', format: 'txt' })
expect(transcript.format).to.equals('txt')
expect(transcript.language).to.equals('fr')
expect(await transcript.read()).not.to.be.empty
})
it('Guesses the video language if not provided', async function () {
this.timeout(2 * 1000 * 60)
const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath })
expect(transcript.language).to.equals('fr')
})
it('Should produce a text transcript similar to openai-whisper implementation', async function () {
this.timeout(10 * 1000 * 60)
const transcribeArgs: WhisperTranscribeArgs = {
mediaFilePath: frVideoPath,
language: 'fr',
format: 'txt'
}
const transcript = await transcriber.transcribe(transcribeArgs)
const openaiTranscriber = new OpenaiTranscriber(
{
name: 'openai-whisper',
requirements: [],
type: 'binary',
binary: 'whisper',
supportedModelFormats: [ 'PyTorch' ]
},
createLogger(),
join(transcriptDirectory, 'openai-whisper')
)
const openaiTranscript = await openaiTranscriber.transcribe(transcribeArgs)
const transcriptFileEvaluator = new TranscriptFileEvaluator(openaiTranscript, transcript)
expect(await transcriptFileEvaluator.wer()).to.be.below(20 / 100)
expect(await transcriptFileEvaluator.cer()).to.be.below(10 / 100)
})
after(async function () {
await rm(transcriptDirectory, { recursive: true, force: true })
})
})


@ -6,16 +6,20 @@
"tsBuildInfoFile": "./dist/.tsbuildinfo",
"paths": {
"@tests/*": [ "./src/*" ],
"@server/*": [ "../../server/core/*" ]
"@server/*": [ "../../server/core/*" ],
"@peertube/peertube-transcription": [ "../transcription" ],
"@peertube/peertube-jiwer": [ "../jiwer" ],
}
},
"references": [
{ "path": "../core-utils" },
{ "path": "../ffmpeg" },
{ "path": "../jiwer" },
{ "path": "../models" },
{ "path": "../node-utils" },
{ "path": "../typescript-utils" },
{ "path": "../server-commands" },
{ "path": "../transcription" },
{ "path": "../../server/tsconfig.lib.json" }
],
"include": [


@ -0,0 +1,99 @@
# Transcription
Video **transcription** consists in transcribing the audio content of a video to text.
> This process might be called __Automatic Speech Recognition__ or __Speech to Text__ in a more general context.
This package provides a common API to multiple transcription backends, currently:
- `openai-whisper` CLI
- `faster-whisper` (*via* `whisper-ctranslate2` CLI)
- `whisper-timestamped`
> Potential candidates could be: whisper-cpp, vosk, ...
## Requirements
- Python
- PIP
And at least one of the following Python transcription backends:
- `openai-whisper`
- `whisper-ctranslate2>=0.4.3`
- `whisper-timestamped>=1.15.4`
And to run the transcript evaluation tests:
- `jiwer>=3.0.4`
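These are Python packages; each PeerTube package that needs them ships a `preinstall` script that installs them from its `requirements.txt`:
```sh
pip install -r requirements.txt
```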
## Usage
Create a transcriber manually:
```typescript
import { OpenaiTranscriber } from '@peertube/peertube-transcription'

(async () => {
  // create a transcriber powered by the OpenAI Whisper CLI
  const transcriber = new OpenaiTranscriber({
    name: 'openai-whisper',
    binary: 'whisper',
    languageDetection: true
  });
  const transcriptFile = await transcriber.transcribe({
    mediaFilePath: './myVideo.mp4',
    format: 'txt'
  });
  console.log(transcriptFile.path);
  console.log(await transcriptFile.read());
})();
```
Using a local model file:
```typescript
import { TranscriptionModel } from '@peertube/peertube-transcription'

const transcriptFile = await transcriber.transcribe({
  mediaFilePath: './myVideo.mp4',
  model: await TranscriptionModel.fromPath('./models/large.pt'),
  format: 'txt'
});
```
You may use the builtin factory if you're happy with the default configuration:
```typescript
import { transcriberFactory } from '@peertube/peertube-transcription'
transcriberFactory.createFromEngineName('openai-whisper')
```
> For further usage, see [../tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts](../tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts)
## Benchmark
A benchmark of available __transcribers__ might be run with:
```sh
npm run benchmark
```
```
┌────────────────────────┬───────────────────────┬───────────────────────┬──────────┬────────┬───────────────────────┐
│ (index) │ WER │ CER │ duration │ model │ engine │
├────────────────────────┼───────────────────────┼───────────────────────┼──────────┼────────┼───────────────────────┤
│ 5yZGBYqojXe7nuhq1TuHvz │ '28.39506172839506%' │ '9.62457337883959%' │ '41s' │ 'tiny' │ 'openai-whisper' │
│ x6qREJ2AkTU4e5YmvfivQN │ '29.75206611570248%' │ '10.46195652173913%' │ '15s' │ 'tiny' │ 'whisper-ctranslate2' │
│ qbt6BekKMVzxq4KCSLCzt3 │ '31.020408163265305%' │ '10.784982935153584%' │ '20s' │ 'tiny' │ 'whisper-timestamped' │
└────────────────────────┴───────────────────────┴───────────────────────┴──────────┴────────┴───────────────────────┘
```
The benchmark may be run with multiple builtin model sizes:
```sh
MODELS=tiny,small,large npm run benchmark
```
## Lexicon
- ONNX: Open Neural Network eXchange. A model specification; the ONNX Runtime runs these models.
- GPTs: Generative Pre-Trained Transformers
- LLM: Large Language Models
- NLP: Natural Language Processing
- MLP: Multilayer Perceptron
- ASR: Automatic Speech Recognition
- WER: Word Error Rate
- CER: Character Error Rate


@ -0,0 +1,21 @@
{
"name": "@peertube/peertube-transcription",
"private": true,
"version": "0.0.0",
"main": "dist/index.js",
"files": [ "dist" ],
"exports": {
"types": "./dist/index.d.ts",
"peertube:tsx": "./src/index.ts",
"default": "./dist/index.js"
},
"type": "module",
"devDependencies": {},
"scripts": {
"preinstall": "pip install -r requirements.txt",
"build": "tsc",
"watch": "tsc -w",
"benchmark": "tsx --conditions=peertube:tsx --tsconfig ./tsconfig.json ./src/benchmark.ts"
},
"dependencies": {}
}


@ -0,0 +1,3 @@
openai-whisper==20231117
whisper-ctranslate2==0.4.4
whisper-timestamped==1.15.4


@ -0,0 +1,69 @@
import { createLogger, Logger } from 'winston'
import { join } from 'node:path'
import { PerformanceObserver } from 'node:perf_hooks'
import { buildSUUID, SUUID, root } from '@peertube/peertube-node-utils'
import { TranscriptionEngine } from './transcription-engine.js'
import { TranscriptionModel } from './transcription-model.js'
import { TranscriptionRun } from './transcription-run.js'
import { TranscriptFile, TranscriptFormat } from './transcript/index.js'
export interface TranscribeArgs {
mediaFilePath: string
model: TranscriptionModel
language?: string
format?: TranscriptFormat
runId?: SUUID
}
export abstract class AbstractTranscriber {
public static DEFAULT_TRANSCRIPT_DIRECTORY = join(root(), 'dist', 'transcripts')
engine: TranscriptionEngine
logger: Logger
transcriptDirectory: string
performanceObserver?: PerformanceObserver
run?: TranscriptionRun
constructor (
engine: TranscriptionEngine,
logger: Logger = createLogger(),
transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY,
performanceObserver?: PerformanceObserver
) {
this.engine = engine
this.logger = logger
this.transcriptDirectory = transcriptDirectory
this.performanceObserver = performanceObserver
}
createRun (uuid: SUUID = buildSUUID()) {
this.run = new TranscriptionRun(this.logger, uuid)
}
startRun () {
this.run.start()
}
stopRun () {
this.run.stop()
delete this.run
}
assertLanguageDetectionAvailable (language?: string) {
if (!this.engine.languageDetection && !language) {
throw new Error(`Language detection isn't available in ${this.engine.name}. A language must be provided explicitly.`)
}
}
supports (model: TranscriptionModel) {
return model.format === 'PyTorch'
}
abstract transcribe ({
mediaFilePath,
model,
language,
format = 'vtt',
runId = buildSUUID()
}: TranscribeArgs): Promise<TranscriptFile>
}


@ -0,0 +1,139 @@
import { createLogger, transports, format } from 'winston'
import { join } from 'node:path'
import { performance, PerformanceObserver } from 'node:perf_hooks'
import { tmpdir } from 'node:os'
import { rm, mkdir } from 'node:fs/promises'
import { buildAbsoluteFixturePath, buildSUUID, SUUID } from '@peertube/peertube-node-utils'
import {
transcriberFactory,
TranscriptFile,
TranscriptFileEvaluator,
TranscriptionEngine,
TranscriptionModel
} from '@peertube/peertube-transcription'
import { millisecondsToTime } from '@peertube/peertube-core-utils'
interface BenchmarkResult {
uuid: SUUID
WER?: number
CER?: number
duration?: number
engine?: TranscriptionEngine
model?: string
}
type Benchmark = Record<SUUID, BenchmarkResult>
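// Merge a partial result (e.g. only the duration reported by the PerformanceObserver,
// or only the WER/CER figures) into the accumulated map, keyed by run uuid.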
const benchmarkReducer = (benchmark: Benchmark = {}, benchmarkResult: BenchmarkResult) => ({
...benchmark,
[benchmarkResult.uuid]: {
...benchmark[benchmarkResult.uuid],
...benchmarkResult
}
})
const groupBenchmarkResultsByModel = (benchmarkResults: Record<string, BenchmarkResult>) => (benchmarksGroupedByModel, uuid) => ({
...benchmarksGroupedByModel,
[benchmarkResults[uuid].model]: {
...benchmarksGroupedByModel[benchmarkResults[uuid].model],
[uuid]: formatBenchmarkResult(benchmarkResults[uuid])
}
})
interface FormattedBenchmarkResult {
WER?: string
CER?: string
duration?: string
model?: string
engine?: string
}
const formatBenchmarkResult = ({ WER, CER, duration, engine, model }: Partial<BenchmarkResult>): FormattedBenchmarkResult => ({
WER: WER ? `${WER * 100}%` : undefined,
CER: CER ? `${CER * 100}%` : undefined,
duration: duration ? millisecondsToTime(duration) : undefined,
model,
engine: engine.name
})
void (async () => {
const logger = createLogger()
logger.add(new transports.Console({ format: format.printf(log => log.message) }))
const transcribers = [
'openai-whisper',
'whisper-ctranslate2',
'whisper-timestamped'
]
const models = process.env.MODELS
? process.env.MODELS.trim().split(',').map(modelName => modelName.trim()).filter(modelName => modelName)
: [ 'tiny' ]
const transcriptDirectory = join(tmpdir(), 'peertube-transcription', 'benchmark')
const mediaFilePath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4')
const referenceTranscriptFile = new TranscriptFile({
path: buildAbsoluteFixturePath('transcription/videos/derive_sectaire.txt'),
language: 'fr',
format: 'txt'
})
let benchmarkResults: Record<string, BenchmarkResult> = {}
// before
await mkdir(transcriptDirectory, { recursive: true })
const performanceObserver = new PerformanceObserver((items) => {
items
.getEntries()
.forEach((entry) => {
benchmarkResults = benchmarkReducer(benchmarkResults, {
uuid: entry.name as SUUID,
duration: entry.duration
})
})
})
performanceObserver.observe({ type: 'measure' })
// benchmark
logger.info(`Running transcribers benchmark with the following models: ${models.join(', ')}`)
for (const transcriberName of transcribers) {
logger.info(`Create "${transcriberName}" transcriber for the benchmark...`)
const transcriber = transcriberFactory.createFromEngineName(
transcriberName,
createLogger(),
transcriptDirectory
)
for (const modelName of models) {
logger.info(`Run benchmark with "${modelName}" model:`)
const model = new TranscriptionModel(modelName)
const uuid = buildSUUID()
const transcriptFile = await transcriber.transcribe({
mediaFilePath,
model,
language: 'fr',
format: 'txt',
runId: uuid
})
const evaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcriptFile)
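// give the PerformanceObserver a tick to deliver this run's duration measure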
await new Promise(resolve => setTimeout(resolve, 1))
benchmarkResults = benchmarkReducer(benchmarkResults, {
uuid,
engine: transcriber.engine,
WER: await evaluator.wer(),
CER: await evaluator.cer(),
model: model.name
})
}
}
// display
const benchmarkResultsGroupedByModel = Object
.keys(benchmarkResults)
.reduce(groupBenchmarkResultsByModel(benchmarkResults), {})
Object.values(benchmarkResultsGroupedByModel).forEach(benchmark => console.table(benchmark))
// after
await rm(transcriptDirectory, { recursive: true, force: true })
performance.clearMarks()
})()


@ -0,0 +1,13 @@
import { TranscriberFactory } from './transcriber-factory.js'
import { engines } from './whisper/index.js'
export * from './transcript/index.js'
export * from './levenshtein.js'
export * from './subtitle.js'
export * from './transcription-engine.js'
export * from './transcription-model.js'
export * from './transcription-run.js'
export * from './utils.js'
export * from './whisper/index.js'
export const transcriberFactory = new TranscriberFactory(engines)


@ -0,0 +1,101 @@
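// Cost of one DP cell: the cheaper of an insertion/deletion (d0 or d2, plus one),
// otherwise the diagonal d1, plus one only when the characters bx and ay differ.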
function min (d0: number, d1: number, d2: number, bx: number, ay: number) {
return d0 < d1 || d2 < d1
? d0 > d2
? d2 + 1
: d0 + 1
: bx === ay
? d1
: d1 + 1
}
/**
* @see https://github.com/gustf/js-levenshtein
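*
* Trims the common prefix and suffix of the two strings, then fills the
* dynamic-programming matrix row by row, unrolled four columns at a time.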
*/
export function levenshteinDistance (a: string, b: string): number {
if (a === b) {
return 0
}
if (a.length > b.length) {
const tmp = a
a = b
b = tmp
}
let la = a.length
let lb = b.length
while (la > 0 && (a.charCodeAt(la - 1) === b.charCodeAt(lb - 1))) {
la--
lb--
}
let offset = 0
while (offset < la && (a.charCodeAt(offset) === b.charCodeAt(offset))) {
offset++
}
la -= offset
lb -= offset
if (la === 0 || lb < 3) {
return lb
}
let x = 0
let y: number
let d0: number
let d1: number
let d2: number
let d3: number
let dd: number
let dy: number
let ay: number
let bx0: number
let bx1: number
let bx2: number
let bx3: number
const vector: number[] = []
for (y = 0; y < la; y++) {
vector.push(y + 1)
vector.push(a.charCodeAt(offset + y))
}
const len = vector.length - 1
for (; x < lb - 3;) {
bx0 = b.charCodeAt(offset + (d0 = x))
bx1 = b.charCodeAt(offset + (d1 = x + 1))
bx2 = b.charCodeAt(offset + (d2 = x + 2))
bx3 = b.charCodeAt(offset + (d3 = x + 3))
dd = (x += 4)
for (y = 0; y < len; y += 2) {
dy = vector[y]
ay = vector[y + 1]
d0 = min(dy, d0, d1, bx0, ay)
d1 = min(d0, d1, d2, bx1, ay)
d2 = min(d1, d2, d3, bx2, ay)
dd = min(d2, d3, dd, bx3, ay)
vector[y] = dd
d3 = d2
d2 = d1
d1 = d0
d0 = dy
}
}
for (; x < lb;) {
bx0 = b.charCodeAt(offset + (d0 = x))
dd = ++x
for (y = 0; y < len; y += 2) {
dy = vector[y]
vector[y] = dd = min(dy, d0, dd, bx0, vector[y + 1])
d0 = dy
}
}
return dd
}
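
A quick sanity check of the distance function, for reference:

import { levenshteinDistance } from '@peertube/peertube-transcription'

console.log(levenshteinDistance('kitten', 'sitting')) // 3: two substitutions and one insertion
console.log(levenshteinDistance('whisper', 'whisper')) // 0: identical strings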

View File

@ -0,0 +1 @@
export const srtToTxt = (srtContent: string) => srtContent.replace(/^\n*\d+\n\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n/gm, '')
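
The regex drops cue numbers and timestamp lines and keeps only the subtitle text, e.g.:

import { srtToTxt } from '@peertube/peertube-transcription'

const srt = '1\n00:00:00,000 --> 00:00:02,500\nBonjour.\n\n2\n00:00:02,500 --> 00:00:05,000\nBienvenue.\n'
console.log(srtToTxt(srt)) // 'Bonjour.\nBienvenue.\n'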

View File

@ -0,0 +1,49 @@
import { Logger, createLogger } from 'winston'
import { TranscriptionEngine } from './transcription-engine.js'
import {
Ctranslate2Transcriber,
OpenaiTranscriber, WhisperTimestampedTranscriber
} from './whisper/index.js'
import { AbstractTranscriber } from './abstract-transcriber.js'
export class TranscriberFactory {
engines: TranscriptionEngine[]
constructor (engines: TranscriptionEngine[]) {
this.engines = engines
}
createFromEngineName (
engineName: string,
logger: Logger = createLogger(),
transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY
) {
const engine = this.getEngineByName(engineName)
const transcriberArgs: ConstructorParameters<typeof AbstractTranscriber> = [
engine,
logger,
transcriptDirectory
]
switch (engineName) {
case 'openai-whisper':
return new OpenaiTranscriber(...transcriberArgs)
case 'whisper-ctranslate2':
return new Ctranslate2Transcriber(...transcriberArgs)
case 'whisper-timestamped':
return new WhisperTimestampedTranscriber(...transcriberArgs)
default:
throw new Error(`Unimplemented engine ${engineName}`)
}
}
getEngineByName (engineName: string) {
const engine = this.engines.find(({ name }) => name === engineName)
if (!engine) {
throw new Error(`Unknown engine ${engineName}`)
}
return engine
}
}

View File

@ -0,0 +1,3 @@
export * from './transcript-file.js'
export * from './transcript-file-evaluator.js'
export * from './transcript-file-interface.js'

View File

@ -0,0 +1,12 @@
export interface TranscriptFileEvaluation {
wer: number
cer: number
alignment: string
}
export interface TranscriptFileEvaluatorInterface {
wer(): Promise<number>
cer(): Promise<number>
alignment(): Promise<string>
evaluate(): Promise<TranscriptFileEvaluation>
}

View File

@ -0,0 +1,46 @@
import assert from 'node:assert'
import { JiwerClI } from '@peertube/peertube-jiwer'
import { TranscriptFileEvaluatorInterface } from './transcript-file-evaluator-interface.js'
import { TranscriptFileInterface } from './index.js'
export class TranscriptFileEvaluator implements TranscriptFileEvaluatorInterface {
referenceTranscriptFile: TranscriptFileInterface
hypothesisTranscriptFile: TranscriptFileInterface
jiwerCLI: JiwerClI
constructor (referenceTranscriptFile: TranscriptFileInterface, hypothesisTranscriptFile: TranscriptFileInterface) {
assert(referenceTranscriptFile.format === 'txt', 'Can only evaluate txt transcript file')
assert(hypothesisTranscriptFile.format === 'txt', 'Can only evaluate txt transcript file')
this.referenceTranscriptFile = referenceTranscriptFile
this.hypothesisTranscriptFile = hypothesisTranscriptFile
this.jiwerCLI = new JiwerClI(this.referenceTranscriptFile.path, this.hypothesisTranscriptFile.path)
}
/**
* WER: Word Error Rate
*/
wer () {
return this.jiwerCLI.wer()
}
/**
* CER: Character Error Rate
*/
cer () {
return this.jiwerCLI.cer()
}
alignment () {
return this.jiwerCLI.alignment()
}
async evaluate () {
return {
wer: await this.wer(),
cer: await this.cer(),
alignment: await this.alignment()
}
}
}
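
A usage sketch, assuming two txt transcripts already exist at these hypothetical paths:

import { TranscriptFile, TranscriptFileEvaluator } from '@peertube/peertube-transcription'

const reference = new TranscriptFile({ path: '/tmp/reference.txt', language: 'fr', format: 'txt' })
const hypothesis = new TranscriptFile({ path: '/tmp/hypothesis.txt', language: 'fr', format: 'txt' })

const evaluation = await new TranscriptFileEvaluator(reference, hypothesis).evaluate()
console.log(evaluation.wer, evaluation.cer) // word and character error rates computed by the JiWer CLI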

View File

@ -0,0 +1,3 @@
export type TranscriptFormat = 'txt' | 'vtt' | 'srt' | 'json'
export type TranscriptFileInterface = { path: string, language?: string, format: TranscriptFormat }

View File

@ -0,0 +1,88 @@
import { statSync } from 'node:fs'
import { readFile, writeFile } from 'node:fs/promises'
import { extname } from 'node:path'
import assert from 'node:assert'
import { TranscriptFileInterface, TranscriptFormat } from './transcript-file-interface.js'
import { TranscriptFileEvaluator } from './transcript-file-evaluator.js'
import { srtToTxt } from '../subtitle.js'
import { levenshteinDistance } from '../levenshtein.js'
export class TranscriptFile implements TranscriptFileInterface {
path: string
language: string
format: TranscriptFormat = 'vtt'
constructor ({ path, language, format = 'vtt' }: { path: string, language: string, format?: TranscriptFormat }) {
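// statSync throws when the file doesn't exist, so an invalid path fails fast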
statSync(path)
this.path = path
this.language = language
this.format = format
}
/**
* Asynchronously reads the entire contents of a transcript file.
* @see https://nodejs.org/docs/latest-v18.x/api/fs.html#filehandlereadfileoptions for options
*/
async read (options: Parameters<typeof readFile>[1] = 'utf8') {
return await readFile(this.path, options)
}
static fromPath (path: string, language = 'en') {
const format = extname(path).substring(1)
const guessableFormats = [ 'txt', 'vtt', 'srt' ]
assert(
guessableFormats.includes(format),
`Couldn't guess transcript format from extension "${format}". Valid formats are: ${guessableFormats.join(', ')}.`)
return new TranscriptFile({ path, language, format: format as TranscriptFormat })
}
/**
* Write a transcript file to disk.
*/
static async write ({
path,
content,
language = 'en',
format = 'vtt'
}: { path: string, content: string, language?: string, format?: TranscriptFormat }): Promise<TranscriptFile> {
await writeFile(path, content)
return new TranscriptFile({ path, language, format })
}
async equals (transcript: TranscriptFile, caseSensitive: boolean = true) {
if (this.language !== transcript.language) {
return false
}
const content = await this.read()
const transcriptContent = await transcript.read()
if (!caseSensitive) {
return String(content).toLowerCase() === String(transcriptContent).toLowerCase()
}
return content === transcriptContent
}
cer (transcript: TranscriptFile) {
return (new TranscriptFileEvaluator(this, transcript)).cer()
}
async evaluate (transcript: TranscriptFile) {
const evaluator = new TranscriptFileEvaluator(this, transcript)
return evaluator.evaluate()
}
async readAsTxt () {
return srtToTxt(String(await this.read()))
}
async distance (transcript: TranscriptFile) {
return levenshteinDistance(await this.readAsTxt(), await transcript.readAsTxt())
}
}
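
A sketch of the write/read round trip with a hypothetical path; fromPath guesses the format from the file extension:

import { TranscriptFile } from '@peertube/peertube-transcription'

const written = await TranscriptFile.write({
  path: '/tmp/video.txt', // hypothetical path
  content: 'Hello world.',
  language: 'en',
  format: 'txt'
})
const reread = TranscriptFile.fromPath('/tmp/video.txt', 'en')
console.log(await written.equals(reread)) // true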

View File

@ -0,0 +1,23 @@
import { ModelFormat } from './transcription-model.js'
/**
* The engine, or framework.
*/
export class TranscriptionEngine {
name: string
description?: string
language?: string
requirements: string[]
type: 'binary' | 'bindings' | 'ws'
binary: string
license?: string
forgeURL?: string
supportedModelFormats: ModelFormat[]
languageDetection?: true
// There could be a default model, or a list of default models.
constructor (parameters: TranscriptionEngine) {
Object.assign(this, parameters)
}
}

View File

@ -0,0 +1,34 @@
import assert from 'node:assert'
import { stat } from 'node:fs/promises'
import { parse } from 'node:path'
export type ModelFormat = 'PyTorch' | 'GGML' | 'ONNX' | 'CTranslate2' // CoreML, OpenVino, Scikit-Learn, TensorFlow/Keras, PySpark
export class TranscriptionModel {
name: string
format?: ModelFormat
path?: string
// # - hparams
// # - Number of dimensions (int)
// # - Name length (int)
// # - Dimensions (int[n_dims])
// # - Name (char[name_length])
// # - Data (float[n_dims])
// # - mel filters
// # - tokenizer vocab
// # - model variables
constructor (name: string, path?: string, format?: ModelFormat) {
this.name = name
this.path = path
this.format = format
}
static async fromPath (path: string) {
assert(await stat(path).then(() => true, () => false), `${path} doesn't exist.`)
return new TranscriptionModel(parse(path).name, path)
}
}
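
The model name is derived from the file name, e.g. with a hypothetical path:

import { TranscriptionModel } from '@peertube/peertube-transcription'

const model = await TranscriptionModel.fromPath('/tmp/models/tiny.pt') // hypothetical path
console.log(model.name) // 'tiny'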

View File

@ -0,0 +1,41 @@
import { buildSUUID, SUUID } from '@peertube/peertube-node-utils'
import { createLogger, Logger } from 'winston'
export class TranscriptionRun {
uuid: SUUID
logger: Logger
constructor (logger = createLogger(), uuid: SUUID = buildSUUID()) {
this.uuid = uuid
this.logger = logger
}
get runId () {
return this.uuid
}
start () {
performance.mark(this.getStartPerformanceMarkName())
}
stop () {
try {
performance.mark(this.getEndPerformanceMarkName())
performance.measure(
this.runId,
this.getStartPerformanceMarkName(),
this.getEndPerformanceMarkName()
)
} catch (e) {
this.logger.log({ level: 'error', message: e })
}
}
getStartPerformanceMarkName () {
return `${this.runId}-started`
}
getEndPerformanceMarkName () {
return `${this.runId}-ended`
}
}
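
The measure recorded by stop() is named after the run uuid, which is how the benchmark's PerformanceObserver maps durations back to runs; a minimal sketch:

import { TranscriptionRun } from '@peertube/peertube-transcription'

const run = new TranscriptionRun()
run.start()
// ... transcription work happens here ...
run.stop() // records a performance.measure entry named run.uuid with the elapsed duration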

View File

@ -0,0 +1,32 @@
import { join, parse } from 'node:path'
import { createWriteStream } from 'node:fs'
import { lstat, unlink } from 'node:fs/promises'
import assert from 'node:assert'
import { $ } from 'execa'
import { makeFileRequest } from '@peertube/peertube-server-commands'
export const downloadFile = async (url: string, targetDirectory: string) => {
const { base } = parse(url)
const filePath = join(targetDirectory, base)
const fileStream = createWriteStream(filePath)
const stream = makeFileRequest(url).pipe(fileStream)
return await new Promise((resolve: (filePath: string) => void, reject) => {
stream.on('finish', () => resolve(filePath))
stream.on('error', async e => {
fileStream.close()
await unlink(filePath)
reject(e.message)
})
})
}
export const unzip = async (zipFilePath: string) => {
assert(await lstat(zipFilePath).then(stats => stats.isFile()), `${zipFilePath} isn't a file.`)
const { dir, name } = parse(zipFilePath)
await $`unzip -o ${zipFilePath} -d ${dir}`
return join(dir, name)
}
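
A usage sketch with a hypothetical archive URL:

import { downloadFile, unzip } from '@peertube/peertube-transcription'

const zipPath = await downloadFile('https://example.com/models.zip', '/tmp') // hypothetical URL
const modelsPath = await unzip(zipPath) // extracts next to the archive and returns '/tmp/models'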

View File

@ -0,0 +1,51 @@
import { TranscriptionEngine } from '../transcription-engine.js'
export const engines: TranscriptionEngine[] = [
{
name: 'whisper-cpp',
description: 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
type: 'binary',
binary: 'main',
language: 'cpp',
requirements: [],
forgeURL: 'https://github.com/ggerganov/whisper.cpp',
license: 'MIT',
supportedModelFormats: [ 'GGML' ]
},
{
name: 'openai-whisper',
description: 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
requirements: [ 'python', 'pyTorch', 'ffmpeg' ],
language: 'python',
type: 'binary',
binary: 'whisper',
forgeURL: 'https://github.com/openai/whisper',
license: 'MIT',
supportedModelFormats: [ 'PyTorch' ],
languageDetection: true
},
{
name: 'whisper-ctranslate2',
description: 'Whisper command line client compatible with original OpenAI client based on CTranslate2',
requirements: [ 'python' ],
language: 'python',
type: 'binary',
binary: 'whisper-ctranslate2',
forgeURL: 'https://github.com/Softcatala/whisper-ctranslate2',
license: 'MIT',
supportedModelFormats: [ 'CTranslate2' ],
languageDetection: true
},
{
name: 'whisper-timestamped',
description: 'Multilingual Automatic Speech Recognition with word-level timestamps and confidence',
requirements: [ 'python' ],
language: 'python',
type: 'binary',
binary: 'whisper_timestamped',
forgeURL: 'https://github.com/linto-ai/whisper-timestamped',
license: 'MIT',
supportedModelFormats: [ 'PyTorch' ],
languageDetection: true
}
]

View File

@ -0,0 +1,3 @@
export * from './transcriber/index.js'
export * from './engines.js'
export * from './whisper-builtin-model.js'

View File

@ -0,0 +1,49 @@
import { $ } from 'execa'
import { buildSUUID } from '@peertube/peertube-node-utils'
import { lstat } from 'node:fs/promises'
import { OpenaiTranscriber, WhisperTranscribeArgs } from './openai-transcriber.js'
import { TranscriptFile } from '../../transcript/index.js'
import { WhisperBuiltinModel } from '../whisper-builtin-model.js'
import assert from 'node:assert'
export class Ctranslate2Transcriber extends OpenaiTranscriber {
async transcribe ({
mediaFilePath,
model = new WhisperBuiltinModel('tiny'),
language,
format = 'vtt',
runId = buildSUUID()
}: WhisperTranscribeArgs): Promise<TranscriptFile> {
this.assertLanguageDetectionAvailable(language)
const $$ = $({ verbose: process.env.NODE_ENV !== 'production' })
if (model.path) {
assert(await lstat(model.path).then(stats => stats.isDirectory()), 'Model path must be a path to a directory.')
}
const modelArgs = model.path ? [ '--model_directory', model.path ] : [ '--model', model.name ]
const languageArgs = language ? [ '--language', language ] : []
this.createRun(runId)
this.startRun()
await $$`${this.engine.binary} ${[
mediaFilePath,
...modelArgs,
'--word_timestamps',
'True',
'--output_format',
'all',
'--output_dir',
this.transcriptDirectory,
...languageArgs
]}`
this.stopRun()
return new TranscriptFile({
language: language || await this.getDetectedLanguage(mediaFilePath),
path: this.getTranscriptFilePath(mediaFilePath, format),
format
})
}
}
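
A sketch of transcribing with a local CTranslate2 model; paths are hypothetical, and a local model path must point to a directory:

import { transcriberFactory, TranscriptionModel } from '@peertube/peertube-transcription'

const transcriber = transcriberFactory.createFromEngineName('whisper-ctranslate2')
const transcriptFile = await transcriber.transcribe({
  mediaFilePath: '/tmp/video.mp4', // hypothetical media file
  model: await TranscriptionModel.fromPath('/tmp/models/faster-whisper-tiny'), // hypothetical model directory
  language: 'fr',
  format: 'txt'
})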

View File

@ -0,0 +1,3 @@
export * from './ctranslate2-transcriber.js'
export * from './openai-transcriber.js'
export * from './timestamped-transcriber.js'

View File

@ -0,0 +1,62 @@
import { join } from 'path'
import { $ } from 'execa'
import { buildSUUID } from '@peertube/peertube-node-utils'
import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
import { AbstractTranscriber, TranscribeArgs } from '../../abstract-transcriber.js'
import { WhisperBuiltinModel } from '../whisper-builtin-model.js'
import { TranscriptionModel } from '../../transcription-model.js'
import { readFile } from 'node:fs/promises'
import { parse } from 'node:path'
export type WhisperTranscribeArgs = Omit<TranscribeArgs, 'model'> & { model?: TranscriptionModel }
export class OpenaiTranscriber extends AbstractTranscriber {
async transcribe ({
mediaFilePath,
model = new WhisperBuiltinModel('tiny'),
language,
format = 'vtt',
runId = buildSUUID()
}: WhisperTranscribeArgs): Promise<TranscriptFile> {
this.assertLanguageDetectionAvailable(language)
const $$ = $({ verbose: process.env.NODE_ENV !== 'production' })
const languageArgs = language ? [ '--language', language ] : []
this.createRun(runId)
this.startRun()
await $$`${this.engine.binary} ${[
mediaFilePath,
'--word_timestamps',
'True',
'--model',
model?.path || model.name,
'--output_format',
'all',
'--output_dir',
this.transcriptDirectory,
...languageArgs
]}`
this.stopRun()
return new TranscriptFile({
language: language || await this.getDetectedLanguage(mediaFilePath),
path: this.getTranscriptFilePath(mediaFilePath, format),
format
})
}
async getDetectedLanguage (mediaFilePath: string) {
const { language } = await this.readJsonTranscriptFile(mediaFilePath)
return language
}
async readJsonTranscriptFile (mediaFilePath: string) {
return JSON.parse(await readFile(this.getTranscriptFilePath(mediaFilePath, 'json'), 'utf8'))
}
getTranscriptFilePath (mediaFilePath: string, format: TranscriptFormat) {
return join(this.transcriptDirectory, `${parse(mediaFilePath).name}.${format}`)
}
}
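
A sketch relying on language detection: the openai-whisper engine declares languageDetection, so language may be omitted and is read back from the JSON output; the media path is hypothetical:

import { transcriberFactory } from '@peertube/peertube-transcription'

const transcriber = transcriberFactory.createFromEngineName('openai-whisper')
const transcriptFile = await transcriber.transcribe({
  mediaFilePath: '/tmp/video.mp4', // hypothetical media file
  format: 'vtt'
})
console.log(transcriptFile.language) // detected language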

View File

@ -0,0 +1,55 @@
import { $ } from 'execa'
import { buildSUUID } from '@peertube/peertube-node-utils'
import assert from 'node:assert'
import { join, parse } from 'node:path'
import { existsSync } from 'node:fs'
import { rename } from 'node:fs/promises'
import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
import { OpenaiTranscriber, WhisperTranscribeArgs } from './openai-transcriber.js'
import { WhisperBuiltinModel } from '../whisper-builtin-model.js'
export class WhisperTimestampedTranscriber extends OpenaiTranscriber {
async transcribe ({
mediaFilePath,
model = new WhisperBuiltinModel('tiny'),
language,
format = 'vtt',
runId = buildSUUID()
}: WhisperTranscribeArgs): Promise<TranscriptFile> {
this.assertLanguageDetectionAvailable(language)
const $$ = $({ verbose: process.env.NODE_ENV !== 'production' })
const languageArgs = language ? [ '--language', language ] : []
this.createRun(runId)
this.startRun()
await $$`${this.engine.binary} ${[
mediaFilePath,
'--model',
model?.path || model.name,
'--output_format',
'all',
'--output_dir',
this.transcriptDirectory,
...languageArgs
]}`
this.stopRun()
const internalTranscriptPath = this.getTranscriptFilePath(mediaFilePath, format, false)
const transcriptPath = join(this.transcriptDirectory, `${parse(mediaFilePath).name}.${format}`)
// whisper-timestamped names output files after the full media file name, extension included, by default, e.g. video.mp4.vtt
// @see https://github.com/linto-ai/whisper-timestamped/issues/189
assert(existsSync(internalTranscriptPath), `${internalTranscriptPath} file doesn't exist.`)
await rename(internalTranscriptPath, transcriptPath)
// Intermediate output example: communiquer-lors-dune-classe-transplantee.mp4.words.json
return new TranscriptFile({
language: language || await this.getDetectedLanguage(mediaFilePath),
path: transcriptPath,
format
})
}
getTranscriptFilePath (mediaFilePath: string, format: TranscriptFormat, words = true) {
return join(this.transcriptDirectory, `${parse(mediaFilePath).base}${words ? '.words' : ''}.${format}`)
}
}

View File

@ -0,0 +1,11 @@
import { TranscriptionModel } from '../transcription-model.js'
export type WhisperBuiltinModelName = 'tiny' | 'base' | 'small' | 'medium' | 'large' | 'large-v2' | 'large-v3'
export class WhisperBuiltinModel extends TranscriptionModel {
// eslint-disable-next-line @typescript-eslint/no-useless-constructor
constructor (name: WhisperBuiltinModelName) {
super(name)
}
}

View File

@ -0,0 +1,15 @@
{
"extends": "../../tsconfig.base.json",
"compilerOptions": {
"outDir": "./dist",
"rootDir": "src",
"tsBuildInfoFile": "./dist/.tsbuildinfo"
},
"references": [
{ "path": "../models" },
{ "path": "../core-utils" },
{ "path": "../node-utils" },
{ "path": "../jiwer" },
{ "path": "../server-commands" }
]
}

View File

@ -0,0 +1,10 @@
{
"extends": "./tsconfig.json",
"compilerOptions": {
"outDir": "../types-generator/dist/peertube-transcription",
"tsBuildInfoFile": "../types-generator/dist/peertube-transcription/.tsbuildinfo",
"stripInternal": true,
"removeComments": false,
"emitDeclarationOnly": true
}
}

View File

@ -146,4 +146,13 @@ elif [ "$1" = "lint" ]; then
npm run swagger-cli -- validate support/doc/api/openapi.yaml
( cd client && npm run lint )
elif [ "$1" = "transcription" ]; then
npm run preinstall --workspace=@peertube/peertube-transcription --workspace=@peertube/peertube-jiwer
npm run build:server
npm run build:tests
transcriptionFiles=$(findTestFiles ./packages/tests/dist/transcription)
jiwerFiles=$(findTestFiles ./packages/tests/dist/jiwer)
MOCHA_PARALLEL=true runJSTest "$1" $((3*$speedFactor)) $transcriptionFiles $jiwerFiles
fi

View File

@ -14,6 +14,7 @@
{ "path": "../packages/ffmpeg" },
{ "path": "../packages/models" },
{ "path": "../packages/node-utils" },
{ "path": "../packages/transcription" },
{ "path": "../packages/typescript-utils" }
],
"include": [

View File

@ -24,9 +24,11 @@
{ "path": "./apps/peertube-cli" },
{ "path": "./packages/core-utils" },
{ "path": "./packages/ffmpeg" },
{ "path": "./packages/jiwer" },
{ "path": "./packages/models" },
{ "path": "./packages/node-utils" },
{ "path": "./packages/server-commands" },
{ "path": "./packages/transcription" },
{ "path": "./packages/typescript-utils" }
]
}