1
0
Files
Toak/Program.cs

384 lines
15 KiB
C#

using System.Diagnostics;
using Toak.Audio;
using Toak.Configuration;
using Toak.Api;
using Toak.Core;
using Toak.IO;
// Global flags. All of them are plain boolean switches, so they are looked up
// anywhere on the command line rather than at a fixed position.
bool pipeToStdout = args.Contains("--pipe") || args.Contains("-p");
bool copyToClipboard = args.Contains("--copy");
bool verbose = args.Contains("-v") || args.Contains("--verbose");
Logger.Verbose = verbose;

// The command is the first argument that is not a flag. Taking the first
// non-flag argument (instead of insisting on args[0]) makes `toak -v toggle`
// behave the same as `toak toggle -v`. Stays empty when only flags were given.
string command = args.FirstOrDefault(a => !a.StartsWith('-')) ?? "";
// Print the usage text when help is requested explicitly, or when toak is
// invoked without any arguments at all.
if ((args.Length == 0 && string.IsNullOrEmpty(command)) || args.Contains("--help") || args.Contains("-h"))
{
    string[] usageLines =
    {
        "Toak: High-speed Linux Dictation",
        "Usage:",
        " toak toggle - Starts or stops the recording",
        " toak discard - Abort current recording without transcribing",
        " toak onboard - Configure the application",
        " toak latency-test - Benchmark full pipeline without recording",
        " toak config <key> <value> - Update a specific configuration setting",
        " toak show - Show current configuration",
        "Flags:",
        " -h, --help - Show this help message",
        " -p, --pipe - Output transcription to stdout instead of typing",
        " --copy - Copy to clipboard instead of typing",
        " -v, --verbose - Enable detailed debug logging",
    };

    foreach (var usageLine in usageLines)
    {
        Console.WriteLine(usageLine);
    }

    return;
}

// Arguments were supplied, but none of them was recognised as a command.
if (string.IsNullOrEmpty(command))
{
    Console.WriteLine("Error: Please specify a command (e.g. 'toggle'). Use 'toak --help' for usage.");
    return;
}
// `toak onboard`: interactive setup. Every prompt shows the currently stored
// value in brackets; an empty answer keeps that value. The configuration is
// saved once, after the last prompt.
if (command == "onboard")
{
    var config = ConfigManager.LoadConfig();

    // Do not echo the stored secret back to the terminal; only report whether
    // a key exists (same presentation as `toak show`).
    var keyStatus = string.IsNullOrEmpty(config.GroqApiKey) ? "Not Set" : "Set";
    Console.Write($"Groq API Key [{keyStatus}]: ");
    // Trim so that whitespace picked up while pasting does not end up in the stored key.
    var key = Console.ReadLine()?.Trim();
    if (!string.IsNullOrEmpty(key))
    {
        config.GroqApiKey = key;
    }

    Console.WriteLine();
    Console.WriteLine("LLM Model:");
    Console.WriteLine(" 1) openai/gpt-oss-20b -- fastest");
    Console.WriteLine(" 2) llama-3.1-8b-instant -- cheapest, but dumb");
    Console.Write($"Select 1 or 2 [{config.LlmModel}]: ");
    // Accept either the menu number or the full model name; anything else keeps the current model.
    var llmSelection = Console.ReadLine()?.Trim();
    if (llmSelection == "1" || llmSelection == "openai/gpt-oss-20b")
    {
        config.LlmModel = "openai/gpt-oss-20b";
    }
    else if (llmSelection == "2" || llmSelection == "llama-3.1-8b-instant")
    {
        config.LlmModel = "llama-3.1-8b-instant";
    }

    Console.WriteLine();
    Console.WriteLine("Whisper Model:");
    Console.WriteLine(" 1) whisper-large-v3 -- large model, very accurate");
    Console.WriteLine(" 2) whisper-large-v3-turbo -- very fast, a bit less accurate");
    Console.Write($"Select 1 or 2 [{config.WhisperModel}]: ");
    var whisperSelection = Console.ReadLine()?.Trim();
    if (whisperSelection == "1" || whisperSelection == "whisper-large-v3")
    {
        config.WhisperModel = "whisper-large-v3";
    }
    else if (whisperSelection == "2" || whisperSelection == "whisper-large-v3-turbo")
    {
        config.WhisperModel = "whisper-large-v3-turbo";
    }

    Console.WriteLine();
    Console.Write($"Microphone Spoken Language (e.g. en, es, zh) [{config.WhisperLanguage}]: ");
    var lang = Console.ReadLine()?.Trim();
    if (!string.IsNullOrEmpty(lang))
    {
        config.WhisperLanguage = lang.ToLowerInvariant();
    }

    Console.Write($"Typing Backend (xdotool or wtype) [{config.TypingBackend}]: ");
    var backend = Console.ReadLine()?.Trim();
    if (!string.IsNullOrEmpty(backend))
    {
        config.TypingBackend = backend.ToLowerInvariant();
    }

    Console.WriteLine();
    var availableSkills = Toak.Core.Skills.SkillRegistry.AllSkills.Select(s => s.Name);
    Console.WriteLine($"Active Skills (comma separated) [{string.Join(", ", config.ActiveSkills)}]:");
    Console.WriteLine($" Available: {string.Join(", ", availableSkills)}");
    Console.Write("Selection: ");
    var skillsInput = Console.ReadLine();
    if (!string.IsNullOrWhiteSpace(skillsInput))
    {
        // Split on commas, dropping empty entries and surrounding whitespace.
        config.ActiveSkills = skillsInput
            .Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
            .ToList();
    }

    ConfigManager.SaveConfig(config);
    Console.WriteLine("Configuration saved.");
    return;
}
// `toak show`: print the stored configuration. The API key itself is never
// printed, only whether one is present.
if (command == "show")
{
    var current = ConfigManager.LoadConfig();
    Console.WriteLine("Current Configuration:");

    var apiKeyState = "Set";
    if (string.IsNullOrEmpty(current.GroqApiKey))
    {
        apiKeyState = "Not Set";
    }
    Console.WriteLine($" Groq API Key: {apiKeyState}");

    Console.WriteLine($" LLM Model: {current.LlmModel}");
    Console.WriteLine($" Whisper Model: {current.WhisperModel}");

    // An empty language is reported as "Auto".
    var spokenLanguage = string.IsNullOrEmpty(current.WhisperLanguage) ? "Auto" : current.WhisperLanguage;
    Console.WriteLine($" Spoken Language: {spokenLanguage}");

    Console.WriteLine($" Typing Backend: {current.TypingBackend}");

    var skillList = string.Join(", ", current.ActiveSkills);
    Console.WriteLine($" Active Skills: {skillList}");

    Console.WriteLine($" Style Mode: {current.StyleMode}");
    Console.WriteLine($" Punctuation Module: {current.ModulePunctuation}");
    Console.WriteLine($" Technical Sanitization: {current.ModuleTechnicalSanitization}");
    Console.WriteLine($" Bullet Points: {current.StructureBulletPoints}");
    Console.WriteLine($" Smart Paragraphing: {current.StructureSmartParagraphing}");
    return;
}
// `toak config <key> <value>`: update one setting and save the configuration.
// Key and value are both lower-cased before use. Unknown keys and invalid
// values are reported and leave the stored configuration untouched.
if (command == "config")
{
    // Strip every flag, short (-v, -p) as well as long (--verbose), so that a
    // flag placed before the key/value is not mistaken for one of them.
    var positional = args.Where(a => !a.StartsWith('-')).ToArray();
    if (positional.Length < 3)
    {
        Console.WriteLine("Usage: toak config <key> <value>");
        Console.WriteLine("Keys: llm, whisper, style, language, backend, punctuation, tech, bullets, paragraphs");
        return;
    }

    // positional[0] is the command itself.
    var key = positional[1].ToLowerInvariant();
    var val = positional[2].ToLowerInvariant();
    var config = ConfigManager.LoadConfig();

    switch (key)
    {
        case "llm":
            config.LlmModel = val;
            Console.WriteLine($"LLM Model set to {val}");
            break;

        case "whisper":
            config.WhisperModel = val;
            Console.WriteLine($"Whisper Model set to {val}");
            break;

        case "style":
            if (val != "professional" && val != "concise" && val != "casual")
            {
                Console.WriteLine("Invalid style. Use: professional, concise, casual");
                return; // nothing changed, so nothing to save
            }
            config.StyleMode = val;
            Console.WriteLine($"StyleMode set to {val}");
            break;

        case "language":
        case "lang":
            config.WhisperLanguage = val;
            Console.WriteLine($"Spoken Language set to {val}");
            break;

        case "backend":
            config.TypingBackend = val;
            Console.WriteLine($"TypingBackend set to {val}");
            break;

        case "punctuation":
            if (!bool.TryParse(val, out var p))
            {
                Console.WriteLine("Invalid value. Use true or false.");
                return;
            }
            config.ModulePunctuation = p;
            Console.WriteLine($"Punctuation set to {p}");
            break;

        case "tech":
            if (!bool.TryParse(val, out var t))
            {
                Console.WriteLine("Invalid value. Use true or false.");
                return;
            }
            config.ModuleTechnicalSanitization = t;
            Console.WriteLine($"TechnicalSanitization set to {t}");
            break;

        case "bullets":
            if (!bool.TryParse(val, out var b))
            {
                Console.WriteLine("Invalid value. Use true or false.");
                return;
            }
            config.StructureBulletPoints = b;
            Console.WriteLine($"BulletPoints set to {b}");
            break;

        case "paragraphs":
            if (!bool.TryParse(val, out var sp))
            {
                Console.WriteLine("Invalid value. Use true or false.");
                return;
            }
            config.StructureSmartParagraphing = sp;
            Console.WriteLine($"SmartParagraphing set to {sp}");
            break;

        default:
            Console.WriteLine($"Unknown config key: {key}");
            return;
    }

    // Reached only when a setting was actually changed.
    ConfigManager.SaveConfig(config);
    return;
}
// `toak discard`: stop the active recording and delete its audio file without
// transcribing it. Console output is suppressed in pipe mode.
if (command == "discard")
{
    // Nothing to do when no recording is in progress.
    if (!StateTracker.IsRecording())
    {
        if (!pipeToStdout)
        {
            Console.WriteLine("No active recording to discard.");
        }
        return;
    }

    AudioRecorder.StopRecording();

    var recordingPath = AudioRecorder.GetWavPath();
    if (File.Exists(recordingPath))
    {
        File.Delete(recordingPath);
    }

    Notifications.Notify("Toak", "Recording discarded");
    if (!pipeToStdout)
    {
        Console.WriteLine("Recording discarded.");
    }
    return;
}
// `toak latency-test`: generate one second of silence with ffmpeg, send it
// through the transcription call, run a fixed sentence through the refinement
// call, and report both latencies against a 1.5 s total budget.
if (command == "latency-test")
{
    var config = ConfigManager.LoadConfig();
    if (string.IsNullOrWhiteSpace(config.GroqApiKey))
    {
        Console.WriteLine("Groq API Key is not configured. Run 'toak onboard'.");
        return;
    }

    Console.WriteLine("Generating 1-second silent audio file for testing...");
    var testWavPath = Path.Combine(Path.GetTempPath(), "toak_latency_test.wav");
    try
    {
        var pInfo = new ProcessStartInfo
        {
            FileName = "ffmpeg",
            // ArgumentList passes each argument verbatim, so a temp directory
            // containing spaces cannot split the output path.
            ArgumentList = { "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono", "-t", "1", "-y", testWavPath },
            UseShellExecute = false,
            CreateNoWindow = true,
            RedirectStandardError = true,
            RedirectStandardOutput = true
        };

        var generated = false;
        try
        {
            using var proc = Process.Start(pInfo);
            if (proc != null)
            {
                // Redirected streams must be drained; otherwise the child can
                // block on a full pipe while we wait for it to exit.
                var stdoutDrain = proc.StandardOutput.ReadToEndAsync();
                var stderrDrain = proc.StandardError.ReadToEndAsync();
                await proc.WaitForExitAsync();
                await stdoutDrain;
                await stderrDrain;

                // A leftover file from an earlier run must not mask a failed ffmpeg invocation.
                generated = proc.ExitCode == 0;
            }
        }
        catch (System.ComponentModel.Win32Exception startError)
        {
            // Thrown by Process.Start when the executable cannot be launched (e.g. ffmpeg is not installed).
            Logger.LogDebug($"Could not start ffmpeg: {startError.Message}");
        }

        if (!generated || !File.Exists(testWavPath))
        {
            Console.WriteLine("Failed to generate test audio file using ffmpeg.");
            return;
        }

        var groq = new GroqApiClient(config.GroqApiKey);
        try
        {
            Console.WriteLine("Testing STT (Whisper)...");
            var sttWatch = Stopwatch.StartNew();
            var transcript = await groq.TranscribeAsync(testWavPath, config.WhisperLanguage, config.WhisperModel);
            sttWatch.Stop();

            Console.WriteLine("Testing LLM (Llama)...");
            var systemPrompt = PromptBuilder.BuildPrompt(config);
            var llmWatch = Stopwatch.StartNew();
            var refinedText = await groq.RefineTextAsync("Hello world, this is a latency test.", systemPrompt, config.LlmModel);
            llmWatch.Stop();

            var total = sttWatch.ElapsedMilliseconds + llmWatch.ElapsedMilliseconds;
            Console.WriteLine();
            Console.WriteLine($"STT latency: {sttWatch.ElapsedMilliseconds}ms");
            Console.WriteLine($"LLM latency: {llmWatch.ElapsedMilliseconds}ms");
            Console.WriteLine($"Total: {(total / 1000.0):0.0}s ({total}ms)");
            Console.WriteLine($"Status: {(total < 1500 ? "OK (under 1.5s target)" : "SLOW (over 1.5s target)")}");
        }
        catch (Exception ex)
        {
            Console.WriteLine($"Error during test: {ex.Message}");
        }
    }
    finally
    {
        // Remove the temporary audio file on every exit path, including a failed generation.
        if (File.Exists(testWavPath))
        {
            File.Delete(testWavPath);
        }
    }
    return;
}
// `toak toggle`: when a recording is in progress, stop it and run the
// transcribe -> refine -> output pipeline; otherwise start a new recording.
// In pipe mode stdout carries only the final text, so status messages and most
// desktop notifications are suppressed there.
if (command == "toggle")
{
    if (StateTracker.IsRecording())
    {
        var config = ConfigManager.LoadConfig();
        Notifications.PlaySound(config.StopSoundPath);
        if (!pipeToStdout)
        {
            Console.WriteLine("Stopping recording and transcribing...");
            Notifications.Notify("Toak", "Transcribing...");
        }
        AudioRecorder.StopRecording();
        Logger.LogDebug($"Loaded configuration: LLM={config.LlmModel}, Whisper={config.WhisperModel}, Typing={config.TypingBackend}");

        if (string.IsNullOrWhiteSpace(config.GroqApiKey))
        {
            Notifications.Notify("Toak Error", "Groq API Key is not configured. Run 'toak onboard'.");
            return;
        }

        var groq = new GroqApiClient(config.GroqApiKey);
        var wavPath = AudioRecorder.GetWavPath();
        if (!File.Exists(wavPath) || new FileInfo(wavPath).Length == 0)
        {
            if (!pipeToStdout) Notifications.Notify("Toak", "No audio recorded.");
            return;
        }

        try
        {
            // Measures the pipeline from the start of transcription to the end of output.
            var stopWatch = Stopwatch.StartNew();

            // 1. STT
            Logger.LogDebug($"Starting STT transcription via Whisper for {wavPath}...");
            var transcript = await groq.TranscribeAsync(wavPath, config.WhisperLanguage, config.WhisperModel);
            Logger.LogDebug($"Raw transcript received: '{transcript}'");
            if (string.IsNullOrWhiteSpace(transcript))
            {
                if (!pipeToStdout) Notifications.Notify("Toak", "No speech detected.");
                return;
            }

            // 2. LLM Refinement: a detected skill supplies its own system prompt,
            // otherwise the prompt is built from the configuration.
            var detectedSkill = Toak.Core.Skills.SkillRegistry.DetectSkill(transcript, config.ActiveSkills);
            string systemPrompt;
            if (detectedSkill != null)
            {
                Logger.LogDebug($"Skill detected: {detectedSkill.Name}");
                if (!pipeToStdout) Notifications.Notify("Toak Skill Detected", detectedSkill.Name);
                systemPrompt = detectedSkill.GetSystemPrompt(transcript);
            }
            else
            {
                systemPrompt = PromptBuilder.BuildPrompt(config);
            }

            // 3. Output
            if (detectedSkill != null && detectedSkill.HandlesExecution)
            {
                // The skill consumes the complete refined text itself.
                Logger.LogDebug($"Executing skill synchronously: {detectedSkill.Name}");
                string finalText = await groq.RefineTextAsync(transcript, systemPrompt, config.LlmModel);
                Logger.LogDebug($"Skill refined text: '{finalText}'");
                if (string.IsNullOrWhiteSpace(finalText))
                {
                    if (!pipeToStdout) Notifications.Notify("Toak", "Dropped short or empty audio.");
                    return;
                }
                detectedSkill.Execute(finalText);
                stopWatch.Stop();
                Notifications.Notify("Toak", $"Skill executed in {stopWatch.ElapsedMilliseconds}ms");
            }
            else if (pipeToStdout || copyToClipboard)
            {
                // Both targets need the whole text at once, so wait for the full response.
                Logger.LogDebug("Starting LLM text refinement (synchronous)...");
                string finalText = await groq.RefineTextAsync(transcript, systemPrompt, config.LlmModel);
                Logger.LogDebug($"Refined text received: '{finalText}'");
                if (string.IsNullOrWhiteSpace(finalText))
                {
                    if (!pipeToStdout) Notifications.Notify("Toak", "Dropped short or empty audio.");
                    return;
                }
                if (pipeToStdout)
                {
                    Console.WriteLine(finalText);
                }
                else
                {
                    ClipboardManager.Copy(finalText);
                    stopWatch.Stop();
                    Notifications.Notify("Toak", $"Copied to clipboard in {stopWatch.ElapsedMilliseconds}ms");
                }
            }
            else
            {
                // Default: hand the token stream to the typing backend.
                Logger.LogDebug("Starting LLM text refinement (streaming)...");
                var tokenStream = groq.RefineTextStreamAsync(transcript, systemPrompt, config.LlmModel);
                Logger.LogDebug("Starting to inject text...");
                await TextInjector.InjectStreamAsync(tokenStream, config.TypingBackend);
                stopWatch.Stop();
                Notifications.Notify("Toak", $"Done in {stopWatch.ElapsedMilliseconds}ms");
            }
        }
        catch (Exception ex)
        {
            if (pipeToStdout)
            {
                // stdout is reserved for the transcription in pipe mode; report the
                // failure on stderr instead of swallowing it silently.
                Console.Error.WriteLine(ex.ToString());
            }
            else
            {
                Notifications.Notify("Toak Error", ex.Message);
                Console.WriteLine(ex.ToString());
            }
        }
        finally
        {
            // The recording is deleted whether or not the pipeline succeeded.
            if (File.Exists(wavPath)) File.Delete(wavPath);
        }
    }
    else
    {
        // Start recording
        if (!pipeToStdout) Console.WriteLine("Starting recording...");
        var config = ConfigManager.LoadConfig();
        Notifications.PlaySound(config.StartSoundPath);
        AudioRecorder.StartRecording();
    }
}