384 lines
15 KiB
C#
384 lines
15 KiB
C#
using System.Diagnostics;
|
|
using Toak.Audio;
|
|
using Toak.Configuration;
|
|
using Toak.Api;
|
|
using Toak.Core;
|
|
using Toak.IO;
|
|
|
|
// ---------------------------------------------------------------------------
// Command-line parsing.
// Flags may appear anywhere on the command line; the command is the first
// argument that is not a flag.
// ---------------------------------------------------------------------------
bool pipeToStdout = args.Contains("--pipe") || args.Contains("-p");
bool copyToClipboard = args.Contains("--copy");
bool verbose = args.Contains("-v") || args.Contains("--verbose");

Logger.Verbose = verbose;

// Previously only args[0] was considered, so "toak -v toggle" was rejected
// even though the flags themselves are position-independent. Taking the first
// non-flag argument keeps "toak toggle -v" working and accepts "toak -v toggle".
// Stays "" when only flags (or nothing) were supplied.
string command = args.FirstOrDefault(a => !a.StartsWith("-", StringComparison.Ordinal)) ?? "";
|
|
|
|
// Print the usage text when help is requested explicitly, or when toak is
// started with no arguments at all, then exit.
bool showHelp = args.Contains("-h")
    || args.Contains("--help")
    || (args.Length == 0 && string.IsNullOrEmpty(command));

if (showHelp)
{
    string[] usageLines =
    {
        "Toak: High-speed Linux Dictation",
        "Usage:",
        " toak toggle - Starts or stops the recording",
        " toak discard - Abort current recording without transcribing",
        " toak onboard - Configure the application",
        " toak latency-test - Benchmark full pipeline without recording",
        " toak config <key> <value> - Update a specific configuration setting",
        " toak show - Show current configuration",
        "Flags:",
        " -h, --help - Show this help message",
        " -p, --pipe - Output transcription to stdout instead of typing",
        " --copy - Copy to clipboard instead of typing",
        " -v, --verbose - Enable detailed debug logging",
    };

    foreach (var usageLine in usageLines)
    {
        Console.WriteLine(usageLine);
    }

    return;
}
|
|
|
|
// Arguments were given but none of them was recognised as a command:
// point the user at the help text and stop.
if (command is null or "")
{
    const string missingCommandMessage =
        "Error: Please specify a command (e.g. 'toggle'). Use 'toak --help' for usage.";
    Console.WriteLine(missingCommandMessage);
    return;
}
|
|
// "onboard": interactive configuration wizard.
// Each prompt shows the current value in brackets; pressing Enter keeps it.
if (command == "onboard")
{
    var config = ConfigManager.LoadConfig();

    // Never echo the stored secret back to the terminal (scrollback, screen
    // sharing, logs). Show only whether a key is present, like "toak show" does.
    var apiKeyState = string.IsNullOrEmpty(config.GroqApiKey) ? "Not Set" : "Set";
    Console.Write($"Groq API Key [{apiKeyState}]: ");
    var key = Console.ReadLine();
    if (!string.IsNullOrWhiteSpace(key))
    {
        // Trim so stray whitespace from a paste does not end up in the saved key.
        config.GroqApiKey = key.Trim();
    }

    Console.WriteLine();
    Console.WriteLine("LLM Model:");
    Console.WriteLine(" 1) openai/gpt-oss-20b -- fastest");
    Console.WriteLine(" 2) llama-3.1-8b-instant -- cheapest, but dumb");
    Console.Write($"Select 1 or 2 [{config.LlmModel}]: ");
    var llmSelection = Console.ReadLine()?.Trim();
    if (llmSelection == "1" || llmSelection == "openai/gpt-oss-20b")
    {
        config.LlmModel = "openai/gpt-oss-20b";
    }
    else if (llmSelection == "2" || llmSelection == "llama-3.1-8b-instant")
    {
        config.LlmModel = "llama-3.1-8b-instant";
    }
    else if (!string.IsNullOrEmpty(llmSelection))
    {
        // The value is left unchanged, as before; just tell the user so.
        Console.WriteLine($"Unrecognized selection '{llmSelection}'; keeping {config.LlmModel}.");
    }

    Console.WriteLine();
    Console.WriteLine("Whisper Model:");
    Console.WriteLine(" 1) whisper-large-v3 -- large model, very accurate");
    Console.WriteLine(" 2) whisper-large-v3-turbo -- very fast, a bit less accurate");
    Console.Write($"Select 1 or 2 [{config.WhisperModel}]: ");
    var whisperSelection = Console.ReadLine()?.Trim();
    if (whisperSelection == "1" || whisperSelection == "whisper-large-v3")
    {
        config.WhisperModel = "whisper-large-v3";
    }
    else if (whisperSelection == "2" || whisperSelection == "whisper-large-v3-turbo")
    {
        config.WhisperModel = "whisper-large-v3-turbo";
    }
    else if (!string.IsNullOrEmpty(whisperSelection))
    {
        Console.WriteLine($"Unrecognized selection '{whisperSelection}'; keeping {config.WhisperModel}.");
    }

    Console.WriteLine();
    Console.Write($"Microphone Spoken Language (e.g. en, es, zh) [{config.WhisperLanguage}]: ");
    var lang = Console.ReadLine();
    if (!string.IsNullOrWhiteSpace(lang))
    {
        config.WhisperLanguage = lang.Trim().ToLowerInvariant();
    }

    Console.Write($"Typing Backend (xdotool or wtype) [{config.TypingBackend}]: ");
    var backend = Console.ReadLine();
    if (!string.IsNullOrWhiteSpace(backend))
    {
        config.TypingBackend = backend.Trim().ToLowerInvariant();
    }

    Console.WriteLine();
    var availableSkills = Toak.Core.Skills.SkillRegistry.AllSkills.Select(s => s.Name);
    Console.WriteLine($"Active Skills (comma separated) [{string.Join(", ", config.ActiveSkills)}]:");
    Console.WriteLine($" Available: {string.Join(", ", availableSkills)}");
    Console.Write("Selection: ");
    var skillsInput = Console.ReadLine();
    if (!string.IsNullOrWhiteSpace(skillsInput))
    {
        // Comma-separated list; surrounding whitespace and empty entries are dropped.
        config.ActiveSkills = skillsInput
            .Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries)
            .ToList();
    }

    ConfigManager.SaveConfig(config);
    Console.WriteLine("Configuration saved.");
    return;
}
|
|
|
|
// "show": print the current configuration. The API key itself is never
// printed, only whether one is stored.
if (command == "show")
{
    var config = ConfigManager.LoadConfig();
    var stdout = Console.Out;

    stdout.WriteLine("Current Configuration:");

    var apiKeyState = string.IsNullOrEmpty(config.GroqApiKey) ? "Not Set" : "Set";
    stdout.WriteLine($" Groq API Key: {apiKeyState}");

    stdout.WriteLine($" LLM Model: {config.LlmModel}");
    stdout.WriteLine($" Whisper Model: {config.WhisperModel}");

    // An empty language is displayed as "Auto".
    var spokenLanguage = string.IsNullOrEmpty(config.WhisperLanguage) ? "Auto" : config.WhisperLanguage;
    stdout.WriteLine($" Spoken Language: {spokenLanguage}");

    stdout.WriteLine($" Typing Backend: {config.TypingBackend}");

    var activeSkills = string.Join(", ", config.ActiveSkills);
    stdout.WriteLine($" Active Skills: {activeSkills}");

    stdout.WriteLine($" Style Mode: {config.StyleMode}");
    stdout.WriteLine($" Punctuation Module: {config.ModulePunctuation}");
    stdout.WriteLine($" Technical Sanitization: {config.ModuleTechnicalSanitization}");
    stdout.WriteLine($" Bullet Points: {config.StructureBulletPoints}");
    stdout.WriteLine($" Smart Paragraphing: {config.StructureSmartParagraphing}");
    return;
}
|
|
|
|
// "config <key> <value>": update a single setting non-interactively.
if (command == "config")
{
    // Keep positional arguments only. Both long ("--pipe") and short ("-v")
    // flags are dropped; filtering only "--" made "toak config -v llm x"
    // treat "-v" as the key.
    var positional = args.Where(a => !a.StartsWith("-", StringComparison.Ordinal)).ToArray();
    if (positional.Length < 3)
    {
        Console.WriteLine("Usage: toak config <key> <value>");
        Console.WriteLine("Keys: llm, whisper, style, language, backend, punctuation, tech, bullets, paragraphs");
        return;
    }

    // positional[0] is the command itself; key and value are lower-cased.
    var key = positional[1].ToLowerInvariant();
    var val = positional[2].ToLowerInvariant();
    var config = ConfigManager.LoadConfig();

    // Parses <value> as a boolean and applies it through the given setter.
    // Prints the outcome and returns whether the setting was changed.
    bool TrySetFlag(string label, Action<bool> apply)
    {
        if (!bool.TryParse(val, out var parsed))
        {
            Console.WriteLine("Invalid value. Use true or false.");
            return false;
        }

        apply(parsed);
        Console.WriteLine($"{label} set to {parsed}");
        return true;
    }

    // True only when a setting was actually modified, so a rejected value
    // does not rewrite the configuration file.
    bool updated;
    switch (key)
    {
        case "llm":
            config.LlmModel = val;
            Console.WriteLine($"LLM Model set to {val}");
            updated = true;
            break;
        case "whisper":
            config.WhisperModel = val;
            Console.WriteLine($"Whisper Model set to {val}");
            updated = true;
            break;
        case "style":
            if (val == "professional" || val == "concise" || val == "casual")
            {
                config.StyleMode = val;
                Console.WriteLine($"StyleMode set to {val}");
                updated = true;
            }
            else
            {
                Console.WriteLine("Invalid style. Use: professional, concise, casual");
                updated = false;
            }
            break;
        case "language":
        case "lang":
            config.WhisperLanguage = val;
            Console.WriteLine($"Spoken Language set to {val}");
            updated = true;
            break;
        case "backend":
            config.TypingBackend = val;
            Console.WriteLine($"TypingBackend set to {val}");
            updated = true;
            break;
        case "punctuation":
            updated = TrySetFlag("Punctuation", v => config.ModulePunctuation = v);
            break;
        case "tech":
            updated = TrySetFlag("TechnicalSanitization", v => config.ModuleTechnicalSanitization = v);
            break;
        case "bullets":
            updated = TrySetFlag("BulletPoints", v => config.StructureBulletPoints = v);
            break;
        case "paragraphs":
            updated = TrySetFlag("SmartParagraphing", v => config.StructureSmartParagraphing = v);
            break;
        default:
            Console.WriteLine($"Unknown config key: {key}");
            return;
    }

    if (updated)
    {
        ConfigManager.SaveConfig(config);
    }

    return;
}
|
|
|
|
// "discard": abort the recording in progress and throw the audio away
// without transcribing it.
if (command == "discard")
{
    // Nothing to do when no recording is running.
    if (!StateTracker.IsRecording())
    {
        if (!pipeToStdout)
        {
            Console.WriteLine("No active recording to discard.");
        }

        return;
    }

    AudioRecorder.StopRecording();

    // Remove the captured audio, if the recorder produced any.
    var recordedFile = AudioRecorder.GetWavPath();
    if (File.Exists(recordedFile))
    {
        File.Delete(recordedFile);
    }

    Notifications.Notify("Toak", "Recording discarded");
    if (!pipeToStdout)
    {
        Console.WriteLine("Recording discarded.");
    }

    return;
}
|
|
|
|
// "latency-test": time the STT and LLM calls against a generated silent
// clip, without recording anything from the microphone.
if (command == "latency-test")
{
    var config = ConfigManager.LoadConfig();
    if (string.IsNullOrWhiteSpace(config.GroqApiKey))
    {
        Console.WriteLine("Groq API Key is not configured. Run 'toak onboard'.");
        return;
    }

    Console.WriteLine("Generating 1-second silent audio file for testing...");
    var testWavPath = Path.Combine(Path.GetTempPath(), "toak_latency_test.wav");

    // Remove any leftover from an earlier aborted run so that a failed ffmpeg
    // invocation cannot be mistaken for success (no-op when the file is absent).
    File.Delete(testWavPath);

    var pInfo = new ProcessStartInfo
    {
        FileName = "ffmpeg",
        UseShellExecute = false,
        CreateNoWindow = true,
        RedirectStandardError = true,
        RedirectStandardOutput = true
    };

    // Pass arguments individually so a temp path containing spaces or quotes
    // is handed to ffmpeg intact instead of being split by argument parsing.
    foreach (var ffmpegArg in new[] { "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono", "-t", "1", "-y", testWavPath })
    {
        pInfo.ArgumentList.Add(ffmpegArg);
    }

    var generated = false;
    try
    {
        using var proc = Process.Start(pInfo);
        if (proc is not null)
        {
            // Redirected streams must be drained: if the child fills a pipe
            // buffer while nobody reads it, it blocks and WaitForExit never returns.
            var drainStdout = proc.StandardOutput.ReadToEndAsync();
            var drainStderr = proc.StandardError.ReadToEndAsync();
            await proc.WaitForExitAsync();
            await Task.WhenAll(drainStdout, drainStderr);
            generated = proc.ExitCode == 0;
        }
    }
    catch (System.ComponentModel.Win32Exception ex)
    {
        // Thrown when the ffmpeg executable cannot be found or started.
        Console.WriteLine($"Failed to generate test audio file using ffmpeg: {ex.Message}");
        return;
    }

    if (!generated || !File.Exists(testWavPath))
    {
        Console.WriteLine("Failed to generate test audio file using ffmpeg.");
        File.Delete(testWavPath); // discard a partial file, if any
        return;
    }

    var groq = new GroqApiClient(config.GroqApiKey);

    try
    {
        Console.WriteLine("Testing STT (Whisper)...");
        var sttWatch = Stopwatch.StartNew();
        var transcript = await groq.TranscribeAsync(testWavPath, config.WhisperLanguage, config.WhisperModel);
        sttWatch.Stop();

        // The LLM step is timed on a fixed sentence rather than on the
        // transcript of the silent clip.
        Console.WriteLine("Testing LLM (Llama)...");
        var systemPrompt = PromptBuilder.BuildPrompt(config);
        var llmWatch = Stopwatch.StartNew();
        var refinedText = await groq.RefineTextAsync("Hello world, this is a latency test.", systemPrompt, config.LlmModel);
        llmWatch.Stop();

        var total = sttWatch.ElapsedMilliseconds + llmWatch.ElapsedMilliseconds;

        Console.WriteLine();
        Console.WriteLine($"STT latency: {sttWatch.ElapsedMilliseconds}ms");
        Console.WriteLine($"LLM latency: {llmWatch.ElapsedMilliseconds}ms");
        Console.WriteLine($"Total: {(total / 1000.0):0.0}s ({total}ms)");
        Console.WriteLine($"Status: {(total < 1500 ? "OK (under 1.5s target)" : "SLOW (over 1.5s target)")}");
    }
    catch (Exception ex)
    {
        Console.WriteLine($"Error during test: {ex.Message}");
    }
    finally
    {
        // Always clean up the generated clip.
        if (File.Exists(testWavPath))
        {
            File.Delete(testWavPath);
        }
    }

    return;
}
|
|
|
|
// "toggle": start a recording, or, if one is already running, stop it and
// run the pipeline: speech-to-text -> LLM refinement -> output
// (skill execution, stdout, clipboard, or typed into the focused window).
if (command == "toggle")
{
    if (StateTracker.IsRecording())
    {
        var config = ConfigManager.LoadConfig();
        Notifications.PlaySound(config.StopSoundPath);

        if (!pipeToStdout)
        {
            Console.WriteLine("Stopping recording and transcribing...");
            Notifications.Notify("Toak", "Transcribing...");
        }

        AudioRecorder.StopRecording();

        Logger.LogDebug($"Loaded configuration: LLM={config.LlmModel}, Whisper={config.WhisperModel}, Typing={config.TypingBackend}");

        if (string.IsNullOrWhiteSpace(config.GroqApiKey))
        {
            Notifications.Notify("Toak Error", "Groq API Key is not configured. Run 'toak onboard'.");
            return;
        }

        var groq = new GroqApiClient(config.GroqApiKey);
        var wavPath = AudioRecorder.GetWavPath();

        if (!File.Exists(wavPath) || new FileInfo(wavPath).Length == 0)
        {
            if (!pipeToStdout)
            {
                Notifications.Notify("Toak", "No audio recorded.");
            }

            return;
        }

        try
        {
            var stopWatch = Stopwatch.StartNew();

            // 1. STT
            Logger.LogDebug($"Starting STT transcription via Whisper for {wavPath}...");
            var transcript = await groq.TranscribeAsync(wavPath, config.WhisperLanguage, config.WhisperModel);
            Logger.LogDebug($"Raw transcript received: '{transcript}'");

            if (string.IsNullOrWhiteSpace(transcript))
            {
                if (!pipeToStdout)
                {
                    Notifications.Notify("Toak", "No speech detected.");
                }

                return;
            }

            // 2. LLM Refinement: a detected skill supplies its own system
            //    prompt; otherwise the prompt is built from the configuration.
            var detectedSkill = Toak.Core.Skills.SkillRegistry.DetectSkill(transcript, config.ActiveSkills);
            string systemPrompt;
            if (detectedSkill != null)
            {
                Logger.LogDebug($"Skill detected: {detectedSkill.Name}");
                if (!pipeToStdout)
                {
                    Notifications.Notify("Toak Skill Detected", detectedSkill.Name);
                }

                systemPrompt = detectedSkill.GetSystemPrompt(transcript);
            }
            else
            {
                systemPrompt = PromptBuilder.BuildPrompt(config);
            }

            // 3. Output
            if (detectedSkill != null && detectedSkill.HandlesExecution)
            {
                // The skill consumes the complete refined text itself.
                Logger.LogDebug($"Executing skill synchronously: {detectedSkill.Name}");
                string finalText = await groq.RefineTextAsync(transcript, systemPrompt, config.LlmModel);
                Logger.LogDebug($"Skill refined text: '{finalText}'");
                if (string.IsNullOrWhiteSpace(finalText))
                {
                    if (!pipeToStdout)
                    {
                        Notifications.Notify("Toak", "Dropped short or empty audio.");
                    }

                    return;
                }

                detectedSkill.Execute(finalText);
                stopWatch.Stop();
                Notifications.Notify("Toak", $"Skill executed in {stopWatch.ElapsedMilliseconds}ms");
            }
            else if (pipeToStdout || copyToClipboard)
            {
                // stdout and clipboard need the whole text at once, so no streaming.
                Logger.LogDebug("Starting LLM text refinement (synchronous)...");
                string finalText = await groq.RefineTextAsync(transcript, systemPrompt, config.LlmModel);
                Logger.LogDebug($"Refined text received: '{finalText}'");
                if (string.IsNullOrWhiteSpace(finalText))
                {
                    if (!pipeToStdout)
                    {
                        Notifications.Notify("Toak", "Dropped short or empty audio.");
                    }

                    return;
                }

                if (pipeToStdout)
                {
                    Console.WriteLine(finalText);
                }
                else
                {
                    ClipboardManager.Copy(finalText);
                    stopWatch.Stop();
                    Notifications.Notify("Toak", $"Copied to clipboard in {stopWatch.ElapsedMilliseconds}ms");
                }
            }
            else
            {
                // Default: stream tokens to the typing backend as they arrive.
                Logger.LogDebug("Starting LLM text refinement (streaming)...");
                var tokenStream = groq.RefineTextStreamAsync(transcript, systemPrompt, config.LlmModel);
                Logger.LogDebug("Starting to inject text...");
                await TextInjector.InjectStreamAsync(tokenStream, config.TypingBackend);
                stopWatch.Stop();
                Notifications.Notify("Toak", $"Done in {stopWatch.ElapsedMilliseconds}ms");
            }
        }
        catch (Exception ex)
        {
            if (pipeToStdout)
            {
                // Keep stdout clean for the consumer of the pipe, but do not
                // swallow the failure silently: report it on stderr.
                Console.Error.WriteLine($"Toak error: {ex.Message}");
            }
            else
            {
                Notifications.Notify("Toak Error", ex.Message);
                Console.WriteLine(ex.ToString());
            }

            // Let scripts and pipelines detect the failure.
            Environment.ExitCode = 1;
        }
        finally
        {
            // The recording is single-use; remove it whether or not the pipeline succeeded.
            if (File.Exists(wavPath))
            {
                File.Delete(wavPath);
            }
        }
    }
    else
    {
        // Start recording
        if (!pipeToStdout)
        {
            Console.WriteLine("Starting recording...");
        }

        var config = ConfigManager.LoadConfig();
        Notifications.PlaySound(config.StartSoundPath);
        AudioRecorder.StartRecording();
    }
}
|