From c2f4cbbfb2a107f11490d06f24eed7bd2f0aba52 Mon Sep 17 00:00:00 2001 From: TomiEckert Date: Wed, 25 Feb 2026 22:05:32 +0100 Subject: [PATCH] feat: Add configurable LLM and Whisper models with user selection and API integration. --- ConfigManager.cs | 2 ++ GroqApiClient.cs | 8 ++++---- Program.cs | 39 ++++++++++++++++++++++++++++++++++----- 3 files changed, 40 insertions(+), 9 deletions(-) diff --git a/ConfigManager.cs b/ConfigManager.cs index 591176c..b1b914d 100644 --- a/ConfigManager.cs +++ b/ConfigManager.cs @@ -14,6 +14,8 @@ public class ToakConfig public bool StructureSmartParagraphing { get; set; } = true; public string TargetLanguage { get; set; } = string.Empty; public string WhisperLanguage { get; set; } = string.Empty; + public string LlmModel { get; set; } = "openai/gpt-oss-20b"; + public string WhisperModel { get; set; } = "whisper-large-v3-turbo"; } public static class ConfigManager diff --git a/GroqApiClient.cs b/GroqApiClient.cs index 558f8e4..01167ef 100644 --- a/GroqApiClient.cs +++ b/GroqApiClient.cs @@ -51,7 +51,7 @@ public class GroqApiClient _httpClient.BaseAddress = new Uri("https://api.groq.com/openai/v1/"); } - public async Task TranscribeAsync(string filePath, string language = "") + public async Task TranscribeAsync(string filePath, string language = "", string model = "whisper-large-v3-turbo") { using var content = new MultipartFormDataContent(); using var fileStream = File.OpenRead(filePath); @@ -60,7 +60,7 @@ public class GroqApiClient streamContent.Headers.ContentType = new MediaTypeHeaderValue("audio/wav"); // or mpeg content.Add(streamContent, "file", Path.GetFileName(filePath)); - string modelToUse = "whisper-large-v3-turbo"; + string modelToUse = string.IsNullOrWhiteSpace(model) ? "whisper-large-v3-turbo" : model; // according to docs whisper-large-v3-turbo requires the language to be provided if it is to be translated later potentially or if we need the most accurate behavior // Actually, if we want language param, we can pass it to either model @@ -85,11 +85,11 @@ public class GroqApiClient return result?.Text ?? string.Empty; } - public async Task RefineTextAsync(string rawTranscript, string systemPrompt) + public async Task RefineTextAsync(string rawTranscript, string systemPrompt, string model = "openai/gpt-oss-20b") { var requestBody = new LlamaRequest { - Model = "openai/gpt-oss-20b", + Model = string.IsNullOrWhiteSpace(model) ? "openai/gpt-oss-20b" : model, Temperature = 0.0, Messages = new[] { diff --git a/Program.cs b/Program.cs index 15b3708..06c3601 100644 --- a/Program.cs +++ b/Program.cs @@ -44,6 +44,25 @@ if (command == "onboard") var key = Console.ReadLine(); if (!string.IsNullOrWhiteSpace(key)) config.GroqApiKey = key; + Console.WriteLine(); + Console.WriteLine("LLM Model:"); + Console.WriteLine(" 1) openai/gpt-oss-20b -- fastest"); + Console.WriteLine(" 2) llama-3.1-8b-instant -- cheapest, but dumb"); + Console.Write($"Select 1 or 2 [{config.LlmModel}]: "); + var llmSelection = Console.ReadLine()?.Trim(); + if (llmSelection == "1" || llmSelection == "openai/gpt-oss-20b") config.LlmModel = "openai/gpt-oss-20b"; + else if (llmSelection == "2" || llmSelection == "llama-3.1-8b-instant") config.LlmModel = "llama-3.1-8b-instant"; + + Console.WriteLine(); + Console.WriteLine("Whisper Model:"); + Console.WriteLine(" 1) whisper-large-v3 -- large model, very accurate"); + Console.WriteLine(" 2) whisper-large-v3-turbo -- very fast, a bit less accurate"); + Console.Write($"Select 1 or 2 [{config.WhisperModel}]: "); + var whisperSelection = Console.ReadLine()?.Trim(); + if (whisperSelection == "1" || whisperSelection == "whisper-large-v3") config.WhisperModel = "whisper-large-v3"; + else if (whisperSelection == "2" || whisperSelection == "whisper-large-v3-turbo") config.WhisperModel = "whisper-large-v3-turbo"; + + Console.WriteLine(); Console.Write($"Microphone Spoken Language (e.g. en, es, zh) [{config.WhisperLanguage}]: "); var lang = Console.ReadLine(); if (!string.IsNullOrWhiteSpace(lang)) config.WhisperLanguage = lang.ToLowerInvariant(); @@ -62,6 +81,8 @@ if (command == "show") var config = ConfigManager.LoadConfig(); Console.WriteLine("Current Configuration:"); Console.WriteLine($" Groq API Key: {(string.IsNullOrEmpty(config.GroqApiKey) ? "Not Set" : "Set")}"); + Console.WriteLine($" LLM Model: {config.LlmModel}"); + Console.WriteLine($" Whisper Model: {config.WhisperModel}"); Console.WriteLine($" Spoken Language: {(string.IsNullOrEmpty(config.WhisperLanguage) ? "Auto" : config.WhisperLanguage)}"); Console.WriteLine($" Typing Backend: {config.TypingBackend}"); Console.WriteLine($" Style Mode: {config.StyleMode}"); @@ -78,7 +99,7 @@ if (command == "config") if (argsNoFlags.Length < 3) { Console.WriteLine("Usage: toak config "); - Console.WriteLine("Keys: style, backend, punctuation, tech, bullets, paragraphs"); + Console.WriteLine("Keys: llm, whisper, style, language, backend, punctuation, tech, bullets, paragraphs"); return; } @@ -88,6 +109,14 @@ if (command == "config") switch (key) { + case "llm": + config.LlmModel = val; + Console.WriteLine($"LLM Model set to {val}"); + break; + case "whisper": + config.WhisperModel = val; + Console.WriteLine($"Whisper Model set to {val}"); + break; case "style": if (val == "professional" || val == "concise" || val == "casual") { config.StyleMode = val; @@ -182,13 +211,13 @@ if (command == "latency-test") { Console.WriteLine("Testing STT (Whisper)..."); var sttWatch = Stopwatch.StartNew(); - var transcript = await groq.TranscribeAsync(testWavPath, config.WhisperLanguage); + var transcript = await groq.TranscribeAsync(testWavPath, config.WhisperLanguage, config.WhisperModel); sttWatch.Stop(); Console.WriteLine("Testing LLM (Llama)..."); var systemPrompt = PromptBuilder.BuildPrompt(config); var llmWatch = Stopwatch.StartNew(); - var refinedText = await groq.RefineTextAsync("Hello world, this is a latency test.", systemPrompt); + var refinedText = await groq.RefineTextAsync("Hello world, this is a latency test.", systemPrompt, config.LlmModel); llmWatch.Stop(); var total = sttWatch.ElapsedMilliseconds + llmWatch.ElapsedMilliseconds; @@ -246,7 +275,7 @@ if (command == "toggle") var stopWatch = Stopwatch.StartNew(); // 1. STT - var transcript = await groq.TranscribeAsync(wavPath, config.WhisperLanguage); + var transcript = await groq.TranscribeAsync(wavPath, config.WhisperLanguage, config.WhisperModel); if (string.IsNullOrWhiteSpace(transcript)) { if (!pipeToStdout) Notifications.Notify("Toak", "Could not transcribe audio."); @@ -259,7 +288,7 @@ if (command == "toggle") if (!rawOutput) { var systemPrompt = PromptBuilder.BuildPrompt(config); - finalText = await groq.RefineTextAsync(transcript, systemPrompt); + finalText = await groq.RefineTextAsync(transcript, systemPrompt, config.LlmModel); } // 3. Output