feat: Add configurable LLM and Whisper models with user selection and API integration.

2026-02-25 22:05:32 +01:00
parent 863063f124
commit c2f4cbbfb2
3 changed files with 40 additions and 9 deletions
--- a/ConfigManager.cs
+++ b/ConfigManager.cs
@@ -14,6 +14,8 @@ public class ToakConfig
    public bool StructureSmartParagraphing { get; set; } = true;
    public string TargetLanguage { get; set; } = string.Empty;
    public string WhisperLanguage { get; set; } = string.Empty;
+    public string LlmModel { get; set; } = "openai/gpt-oss-20b";
+    public string WhisperModel { get; set; } = "whisper-large-v3-turbo";
 }

 public static class ConfigManager
--- a/GroqApiClient.cs
+++ b/GroqApiClient.cs
@@ -51,7 +51,7 @@ public class GroqApiClient
        _httpClient.BaseAddress = new Uri("https://api.groq.com/openai/v1/");
    }

-    public async Task<string> TranscribeAsync(string filePath, string language = "")
+    public async Task<string> TranscribeAsync(string filePath, string language = "", string model = "whisper-large-v3-turbo")
    {
        using var content = new MultipartFormDataContent();
        using var fileStream = File.OpenRead(filePath);
@@ -60,7 +60,7 @@ public class GroqApiClient
        streamContent.Headers.ContentType = new MediaTypeHeaderValue("audio/wav"); // or mpeg
        content.Add(streamContent, "file", Path.GetFileName(filePath));
        
-        string modelToUse = "whisper-large-v3-turbo";
+        string modelToUse = string.IsNullOrWhiteSpace(model) ? "whisper-large-v3-turbo" : model;

        // according to docs whisper-large-v3-turbo requires the language to be provided if it is to be translated later potentially or if we need the most accurate behavior
        // Actually, if we want language param, we can pass it to either model
@@ -85,11 +85,11 @@ public class GroqApiClient
        return result?.Text ?? string.Empty;
    }

-    public async Task<string> RefineTextAsync(string rawTranscript, string systemPrompt)
+    public async Task<string> RefineTextAsync(string rawTranscript, string systemPrompt, string model = "openai/gpt-oss-20b")
    {
        var requestBody = new LlamaRequest
        {
-            Model = "openai/gpt-oss-20b",
+            Model = string.IsNullOrWhiteSpace(model) ? "openai/gpt-oss-20b" : model,
            Temperature = 0.0,
            Messages = new[]
            {
--- a/Program.cs
+++ b/Program.cs
@@ -44,6 +44,25 @@ if (command == "onboard")
    var key = Console.ReadLine();
    if (!string.IsNullOrWhiteSpace(key)) config.GroqApiKey = key;

+    Console.WriteLine();
+    Console.WriteLine("LLM Model:");
+    Console.WriteLine("  1) openai/gpt-oss-20b -- fastest");
+    Console.WriteLine("  2) llama-3.1-8b-instant -- cheapest, but dumb");
+    Console.Write($"Select 1 or 2 [{config.LlmModel}]: ");
+    var llmSelection = Console.ReadLine()?.Trim();
+    if (llmSelection == "1" || llmSelection == "openai/gpt-oss-20b") config.LlmModel = "openai/gpt-oss-20b";
+    else if (llmSelection == "2" || llmSelection == "llama-3.1-8b-instant") config.LlmModel = "llama-3.1-8b-instant";
+
+    Console.WriteLine();
+    Console.WriteLine("Whisper Model:");
+    Console.WriteLine("  1) whisper-large-v3 -- large model, very accurate");
+    Console.WriteLine("  2) whisper-large-v3-turbo -- very fast, a bit less accurate");
+    Console.Write($"Select 1 or 2 [{config.WhisperModel}]: ");
+    var whisperSelection = Console.ReadLine()?.Trim();
+    if (whisperSelection == "1" || whisperSelection == "whisper-large-v3") config.WhisperModel = "whisper-large-v3";
+    else if (whisperSelection == "2" || whisperSelection == "whisper-large-v3-turbo") config.WhisperModel = "whisper-large-v3-turbo";
+
+    Console.WriteLine();
    Console.Write($"Microphone Spoken Language (e.g. en, es, zh) [{config.WhisperLanguage}]: ");
    var lang = Console.ReadLine();
    if (!string.IsNullOrWhiteSpace(lang)) config.WhisperLanguage = lang.ToLowerInvariant();
@@ -62,6 +81,8 @@ if (command == "show")
    var config = ConfigManager.LoadConfig();
    Console.WriteLine("Current Configuration:");
    Console.WriteLine($"  Groq API Key: {(string.IsNullOrEmpty(config.GroqApiKey) ? "Not Set" : "Set")}");
+    Console.WriteLine($"  LLM Model: {config.LlmModel}");
+    Console.WriteLine($"  Whisper Model: {config.WhisperModel}");
    Console.WriteLine($"  Spoken Language: {(string.IsNullOrEmpty(config.WhisperLanguage) ? "Auto" : config.WhisperLanguage)}");
    Console.WriteLine($"  Typing Backend: {config.TypingBackend}");
    Console.WriteLine($"  Style Mode: {config.StyleMode}");
@@ -78,7 +99,7 @@ if (command == "config")
    if (argsNoFlags.Length < 3)
    {
        Console.WriteLine("Usage: toak config <key> <value>");
-        Console.WriteLine("Keys: style, backend, punctuation, tech, bullets, paragraphs");
+        Console.WriteLine("Keys: llm, whisper, style, language, backend, punctuation, tech, bullets, paragraphs");
        return;
    }
    
@@ -88,6 +109,14 @@ if (command == "config")
    
    switch (key)
    {
+        case "llm":
+            config.LlmModel = val;
+            Console.WriteLine($"LLM Model set to {val}");
+            break;
+        case "whisper":
+            config.WhisperModel = val;
+            Console.WriteLine($"Whisper Model set to {val}");
+            break;
        case "style":
            if (val == "professional" || val == "concise" || val == "casual") {
                config.StyleMode = val;
@@ -182,13 +211,13 @@ if (command == "latency-test")
    {
        Console.WriteLine("Testing STT (Whisper)...");
        var sttWatch = Stopwatch.StartNew();
-        var transcript = await groq.TranscribeAsync(testWavPath, config.WhisperLanguage);
+        var transcript = await groq.TranscribeAsync(testWavPath, config.WhisperLanguage, config.WhisperModel);
        sttWatch.Stop();
        
        Console.WriteLine("Testing LLM (Llama)...");
        var systemPrompt = PromptBuilder.BuildPrompt(config);
        var llmWatch = Stopwatch.StartNew();
-        var refinedText = await groq.RefineTextAsync("Hello world, this is a latency test.", systemPrompt);
+        var refinedText = await groq.RefineTextAsync("Hello world, this is a latency test.", systemPrompt, config.LlmModel);
        llmWatch.Stop();

        var total = sttWatch.ElapsedMilliseconds + llmWatch.ElapsedMilliseconds;
@@ -246,7 +275,7 @@ if (command == "toggle")
            var stopWatch = Stopwatch.StartNew();

            // 1. STT
-            var transcript = await groq.TranscribeAsync(wavPath, config.WhisperLanguage);
+            var transcript = await groq.TranscribeAsync(wavPath, config.WhisperLanguage, config.WhisperModel);
            if (string.IsNullOrWhiteSpace(transcript))
            {
                if (!pipeToStdout) Notifications.Notify("Toak", "Could not transcribe audio.");
@@ -259,7 +288,7 @@ if (command == "toggle")
            if (!rawOutput)
            {
                var systemPrompt = PromptBuilder.BuildPrompt(config);
-                finalText = await groq.RefineTextAsync(transcript, systemPrompt);
+                finalText = await groq.RefineTextAsync(transcript, systemPrompt, config.LlmModel);
            }

            // 3. Output