From 82516810359d1a27835c6fa3f170aadc6de50d4b Mon Sep 17 00:00:00 2001
From: Michael Mainguy
Date: Fri, 11 Aug 2023 10:54:10 -0500
Subject: [PATCH] Added Initial Voice Command Handler.

---
 netlify/functions/voice/voice.ts   |   3 +-
 src/app.ts                         |  44 ++++++++++--
 src/integration/voiceManager.ts    | 106 ++++++++++++++++++++++++-----
 src/integration/voiceTranscript.ts |  11 +++
 vite.config.ts                     |   7 +-
 5 files changed, 146 insertions(+), 25 deletions(-)
 create mode 100644 src/integration/voiceTranscript.ts

diff --git a/netlify/functions/voice/voice.ts b/netlify/functions/voice/voice.ts
index e9754a2..a9dae37 100644
--- a/netlify/functions/voice/voice.ts
+++ b/netlify/functions/voice/voice.ts
@@ -12,8 +12,7 @@ export const handler: Handler = async (event: HandlerEvent, context: HandlerCont
             headers: {'Content-Type': 'application/json'},
             statusCode: 200,
             body: JSON.stringify(data)
-        };
-
+        }
     } catch (error) {
         return {
             statusCode: 500,
diff --git a/src/app.ts b/src/app.ts
index 2175fdd..23051bd 100644
--- a/src/app.ts
+++ b/src/app.ts
@@ -18,6 +18,8 @@ import {InputTextView} from "./information/inputTextView";
 import {GamepadManager} from "./controllers/gamepadManager";
 import {CustomEnvironment} from "./util/customEnvironment";
 import {DrawioManager} from "./integration/drawioManager";
+import {VoiceManager} from "./integration/voiceManager";
+import {TranscriptType} from "./integration/voiceTranscript";


 export class App {
@@ -32,10 +34,10 @@ export class App {
         //log.getLogger('IndexdbPersistenceManager').setLevel('info');
         //log.getLogger('DiagramManager').setLevel('info');
         //log.getLogger('DiagramConnection').setLevel('debug');
-        log.getLogger('DrawioManager').setLevel('debug');
-
-        log.getLogger('EntityTree').setLevel('debug');
-        log.getLogger('EditMenu').setLevel('debug');
+        log.getLogger('DrawioManager').setLevel('warn');
+        log.getLogger('VoiceManager').setLevel('debug');
+        log.getLogger('EntityTree').setLevel('warn');
+        log.getLogger('EditMenu').setLevel('warn');
         const canvas = document.createElement("canvas");
         canvas.style.width = "100%";
         canvas.style.height = "100%";
@@ -53,6 +55,7 @@ export class App {
         const engine = new Engine(canvas, true);
         const scene = new Scene(engine);
         const environment = new CustomEnvironment(scene);
+
         const query = Object.fromEntries(new URLSearchParams(window.location.search));
         logger.debug('Query', query);
         if (query.shareCode) {
@@ -119,8 +122,37 @@ export class App {

         const gamepadManager = new GamepadManager(scene);

+        const voiceManager = new VoiceManager();
+        voiceManager.transcriptionObserver.add((text) => {
+            logger.info('Transcription', text);
+            switch (text.type) {
+                case TranscriptType.PartialTranscript:
+                    if (text.words.length > 0 &&
+                        text.words[0].text.toLowerCase() == 'meta') {
+                        logger.info('Meta command', text.text);
+                    }
+                    break;
+                case TranscriptType.FinalTranscript:
+                    logger.info('Final', text.words[0].text.toLowerCase().substring(0, 4));
+                    if (text.words.length > 0 &&
+                        text.words[0].text.toLowerCase().substring(0, 4) == 'meta' &&
+                        text.words[0].confidence > .8) {
+                        logger.info('Meta Final command',
+                            text.words.map((e) => {
+                                return e.text
+                            }).slice(1).join(' '));
+                    }
+
+            }
+
+        });
         window.addEventListener("keydown", (ev) => {
-            // Shift+Ctrl+Alt+I
+            if (ev.key == "z") {
+                voiceManager.startRecording();
+            }
+            if (ev.key == "x") {
+                voiceManager.stopRecording();
+            }
             if (ev.shiftKey && ev.ctrlKey && ev.altKey && ev.keyCode === 73) {
                 import("@babylonjs/core/Debug/debugLayer").then(() => {
                     import("@babylonjs/inspector").then(() => {
@@ -134,6 +166,7 @@ export class App {
                 }
             });
+        logger.info('keydown event listener added, use Ctrl+Shift+Alt+I to toggle debug layer');

         engine.runRenderLoop(() => {
             scene.render();
@@ -141,6 +174,7 @@ export class App {
         logger.info('Render loop started');
     }
 }
+
 const app = new App();
diff --git a/src/integration/voiceManager.ts b/src/integration/voiceManager.ts
index 2525a11..6de8230 100644
--- a/src/integration/voiceManager.ts
+++ b/src/integration/voiceManager.ts
@@ -1,53 +1,76 @@
 import RecordRTC from 'recordrtc';
+import log from "loglevel";
+import {Observable} from "@babylonjs/core";
+import {TranscriptType, VoiceTranscript} from "./voiceTranscript";
+
+type VoiceManagerEvent = {
+    audio_start?: number;
+    audio_end?: number;
+    confidence?: number;
+    text?: string;
+    words?: Array<VoiceTranscript>;
+    created?: string;
+    message_type?: string
+}

 export class VoiceManager {
     private socket: WebSocket;
     private token: string;
+    public readonly transcriptionObserver: Observable<VoiceTranscript> = new Observable<VoiceTranscript>();
     private recorder: RecordRTC;
     private data: any[] = [];
+    private logger = log.getLogger('VoiceManager');

     constructor() {
-
+        this.setupRecorder();
     }

-    public async setupConnection() {
-        const response = await fetch('/api/voice/token');
+    public startRecording() {
+        this.connectToVoice();
+    }
+
+    public stopRecording() {
+        this.recorder.reset();
+        this.socket.send('{"terminate_session": true}');
+        this.socket = null;
+    }
+
+    public async connectToVoice() {
+        const response = await fetch('/.netlify/functions/voice');
         const data = await response.json();
         this.token = data.token;
         if (!this.socket) {
             this.socket = new WebSocket(`wss://api.assemblyai.com/v2/realtime/ws?sample_rate=16000&token=${this.token}`);
-            this.socket.onmessage = (message) => {
-                const res = JSON.parse(message.data);
-                if (this.data) {
-                    this.data.push(res);
-                    //this.target.emit('transcriptiondata', {data: res});
-                }
-            }
+            this.socket.onmessage = this.messageRecieved;
             this.socket.onopen = this.socketOpen;
+            this.socket.onclose = this.socketClose;
         } else {
             switch (this.socket.readyState) {
                 case 0:
-                    console.log('socket opening');
+                    this.logger.debug('socket opening');
                     break;
                 case 1:
-                    console.log('socket already open');
+                    this.logger.debug('socket already open');
+                    //await this.recorder.startRecording();
                     break;
                 case 2:
-                    console.log('dang, socket is closing');
+                    this.logger.debug('socket is closing');
                     this.socket = null;
+                    //await this.setupConnection();
                     break;
                 case 3:
-                    console.log('Socket is closed');
+                    this.logger.debug('Socket is closed');
                     this.socket = null;
+                    //await this.setupConnection();
                     break
                 default:
-                    console.log(`socket state is unknown: ${this.socket.readyState}`);
+                    this.logger.debug(`socket state is unknown: ${this.socket.readyState}`);
             }
         }
     }

-    private async socketOpen() {
+    private async setupRecorder() {
         if (!this.recorder) {
             const stream = await navigator.mediaDevices.getUserMedia({audio: true});
             this.recorder = new RecordRTC(stream, {
@@ -62,7 +85,7 @@ export class VoiceManager {
                         if (this.socket && (this.socket.readyState === 1)) {
                             this.socket.send(JSON.stringify({audio_data: base64data.split('base64,')[1]}));
                         } else {
-                            console.log('no socket available');
+                            this.logger.warn('no socket available');
                         }
                     };
                     reader.readAsDataURL(blob);
@@ -70,4 +93,53 @@ export class VoiceManager {
             });
         }
     }
+
+    private messageRecieved = (message: any) => {
+        const res = (JSON.parse(message.data) as VoiceManagerEvent);
+        if (this.data) {
+            //this.logger.debug(`Received data: ${JSON.stringify(res)}`);
+            switch (res.message_type) {
+                case 'PartialTranscript':
+                    if (res.words.length > 0) {
+                        this.logger.debug(`PartialTranscript: ${res.text}`);
+                        this.transcriptionObserver.notifyObservers(
+                            {
+                                text: res.text, words: res.words, confidence: res.confidence,
+                                type: TranscriptType.PartialTranscript
+                            });
+                    }
+
+                    break;
+                case 'FinalTranscript':
+                    if (res.words.length > 0) {
+                        this.transcriptionObserver.notifyObservers(
+                            {
+                                text: res.text, words: res.words, confidence: res.confidence,
+                                type: TranscriptType.FinalTranscript
+                            });
+                    }
+
+                    break;
+                case 'SessionBegins':
+                    this.logger.debug(`SessionBegins: ${res}`);
+                    break;
+            }
+        }
+    }
+
+    private socketClose = async () => {
+        this.logger.debug('Socket closed');
+        this.socket = null;
+        this.recorder.reset();
+    }
+
+    private socketOpen = async () => {
+        this.logger.debug('voice socket opened');
+        if (!this.recorder) {
+            this.logger.error('recorder not initialized');
+        } else {
+            this.recorder.startRecording();
+        }
+
+    }
 }
\ No newline at end of file
diff --git a/src/integration/voiceTranscript.ts b/src/integration/voiceTranscript.ts
new file mode 100644
index 0000000..68c549f
--- /dev/null
+++ b/src/integration/voiceTranscript.ts
@@ -0,0 +1,11 @@
+export type VoiceTranscript = {
+    words: VoiceTranscript[];
+    text: string;
+    type: TranscriptType;
+    confidence: number;
+}
+
+export enum TranscriptType {
+    PartialTranscript = 'PartialTranscript',
+    FinalTranscript = 'FinalTranscript'
+}
\ No newline at end of file
diff --git a/vite.config.ts b/vite.config.ts
index 13b7f71..d9ec451 100644
--- a/vite.config.ts
+++ b/vite.config.ts
@@ -3,7 +3,12 @@ import {defineConfig} from "vite";
 /** @type {import('vite').UserConfig} */
 export default defineConfig({
     server: {
-        port: 3001,
+        port: 3001,
+        proxy: {
+            '/.netlify': {
+                target: 'http://localhost:9999/',
+            }
+        }
     },
     base: "/"
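
Note: the voice.ts hunk only shows the response-building tail of the Netlify function
that VoiceManager fetches from '/.netlify/functions/voice'. A minimal sketch of what
the full handler plausibly looks like follows, assuming it exchanges a server-side
AssemblyAI API key for a short-lived realtime token; the token endpoint call, the
ASSEMBLYAI_API_KEY variable name, the expires_in value, and the Node 18+ global fetch
are assumptions, not part of this commit.

    import {Handler, HandlerContext, HandlerEvent} from "@netlify/functions";

    export const handler: Handler = async (event: HandlerEvent, context: HandlerContext) => {
        try {
            // Assumed: trade the secret server-side key for a temporary realtime token
            // via AssemblyAI's v2 temporary-token endpoint (expires_in is in seconds).
            const response = await fetch('https://api.assemblyai.com/v2/realtime/token', {
                method: 'POST',
                headers: {
                    'Authorization': process.env.ASSEMBLYAI_API_KEY ?? '',
                    'Content-Type': 'application/json'
                },
                body: JSON.stringify({expires_in: 3600})
            });
            const data = await response.json(); // expected shape: { token: string }
            return {
                headers: {'Content-Type': 'application/json'},
                statusCode: 200,
                body: JSON.stringify(data)
            }
        } catch (error) {
            return {
                statusCode: 500,
                body: JSON.stringify({error: 'unable to fetch realtime token'})
            }
        }
    };

The browser then passes data.token as the token query parameter on the
wss://api.assemblyai.com/v2/realtime/ws connection, so the secret key never reaches
the client.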
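The diff also skips the RecordRTC options passed in setupRecorder (the lines between
the first and second voiceManager.ts hunks). A hypothetical configuration consistent
with the 16 kHz sample_rate in the WebSocket URL and the base64 framing shown in the
second hunk is sketched below; every option value here is an assumption rather than
the commit's actual settings.

    this.recorder = new RecordRTC(stream, {
        type: 'audio',
        mimeType: 'audio/webm;codecs=pcm',
        recorderType: RecordRTC.StereoAudioRecorder,
        timeSlice: 250,            // emit a blob every 250 ms for streaming
        desiredSampRate: 16000,    // must match the sample_rate query parameter
        numberOfAudioChannels: 1,
        bufferSize: 4096,
        ondataavailable: (blob: Blob) => {
            // Matches the second hunk: read the blob as a data URL, strip the
            // "data:...;base64," prefix, and send the raw base64 audio frame.
            const reader = new FileReader();
            reader.onloadend = () => {
                const base64data = reader.result as string;
                if (this.socket && (this.socket.readyState === 1)) {
                    this.socket.send(JSON.stringify({audio_data: base64data.split('base64,')[1]}));
                } else {
                    this.logger.warn('no socket available');
                }
            };
            reader.readAsDataURL(blob);
        }
    });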