using Hush.Audio;
using Hush.Config;
using Hush.Input;
using Hush.Providers.Interfaces;
using Hush.Providers.Providers;

namespace Hush.Daemon;

/// <summary>
/// Coordinates one dictation cycle: record audio to a temporary WAV file,
/// transcribe it, post-process the transcription with an LLM, and type the
/// result through the configured text-input backend.
/// </summary>
public class Orchestrator
{
    // Shared by every provider instance so connections are reused.
    private static readonly HttpClient _httpClient = new();

    private readonly ConfigManager _configManager;
    private readonly IAudioRecorder _recorder;

    // Session state below is guarded by _lock.
    private string? _recordingPath;
    private DateTime? _recordingStartTime;
    private bool _isRecording;
    private readonly Lock _lock = new();

    /// <summary>
    /// Creates the orchestrator and selects the audio recorder from the
    /// configuration loaded at construction time.
    /// </summary>
    /// <param name="configManager">Source of the current <see cref="HushConfig"/>.</param>
    public Orchestrator(ConfigManager configManager)
    {
        _configManager = configManager;
        _recorder = CreateAudioRecorder();
    }

    /// <summary>
    /// <c>true</c> only while both this orchestrator and the underlying
    /// recorder report an active recording.
    /// </summary>
    public bool IsRecording
    {
        get
        {
            lock (_lock)
            {
                return _isRecording && _recorder.IsRecording;
            }
        }
    }

    /// <summary>
    /// Returns the time elapsed since the current recording started, or
    /// <c>null</c> when no recording is in progress.
    /// </summary>
    public TimeSpan? GetRecordingDuration()
    {
        lock (_lock)
        {
            if (!_isRecording || !_recordingStartTime.HasValue)
            {
                return null;
            }

            return DateTime.UtcNow - _recordingStartTime.Value;
        }
    }

    /// <summary>
    /// Starts recording into a fresh temporary WAV file.
    /// </summary>
    /// <returns>The task returned by the recorder's start operation.</returns>
    /// <exception cref="InvalidOperationException">
    /// Thrown synchronously when a recording is already in progress.
    /// </exception>
    public Task StartRecordingAsync()
    {
        string recordingPath;

        lock (_lock)
        {
            if (_isRecording)
            {
                throw new InvalidOperationException("Recording is already in progress");
            }

            recordingPath = Path.Combine(Path.GetTempPath(), $"hush_recording_{Guid.NewGuid()}.wav");
            _recordingPath = recordingPath;
            _recordingStartTime = DateTime.UtcNow;
            _isRecording = true;
        }

        // Use the local copy: the field may be changed by another call once
        // the lock has been released.
        return StartRecorderAsync(recordingPath);
    }

    /// <summary>
    /// Starts the recorder and rolls the session state back if it fails, so a
    /// failed start does not block every later start attempt.
    /// </summary>
    private async Task StartRecorderAsync(string recordingPath)
    {
        try
        {
            await _recorder.StartRecording(recordingPath);
        }
        catch
        {
            RollBackFailedStart(recordingPath);
            throw;
        }
    }

    /// <summary>
    /// Clears the session state, but only if it still belongs to the session
    /// identified by <paramref name="recordingPath"/>.
    /// </summary>
    private void RollBackFailedStart(string recordingPath)
    {
        lock (_lock)
        {
            if (_recordingPath != recordingPath)
            {
                return;
            }

            _isRecording = false;
            _recordingPath = null;
            _recordingStartTime = null;
        }
    }

    /// <summary>
    /// Stops the current recording and runs it through transcription, LLM
    /// post-processing and typing. Does nothing when no recording is active.
    /// Processing failures are reported as desktop notifications instead of
    /// being thrown; the temporary recording is removed in every case.
    /// </summary>
    /// <param name="overrideConfig">
    /// Configuration to use for this run; when <c>null</c> the configuration
    /// is loaded from the config manager.
    /// </param>
    public async Task StopAndProcessAsync(HushConfig? overrideConfig = null)
    {
        string? recordingPath;
        DateTime? recordingStartTime;

        lock (_lock)
        {
            if (!_isRecording)
            {
                return;
            }

            recordingPath = _recordingPath;
            recordingStartTime = _recordingStartTime;
            _isRecording = false;
        }

        await _recorder.StopRecording();

        if (string.IsNullOrEmpty(recordingPath) || !File.Exists(recordingPath))
        {
            SendNotification("Error", "Recording file not found");
            return;
        }

        try
        {
            var config = overrideConfig ?? _configManager.Load();

            var recordingDuration = recordingStartTime.HasValue
                ? DateTime.UtcNow - recordingStartTime.Value
                : TimeSpan.Zero;
            var minDuration = TimeSpan.FromMilliseconds(config.MinRecordingDuration);

            if (recordingDuration < minDuration)
            {
                SendNotification("Hush", "Recording too short, ignored");
                return;
            }

            var transcription = await TranscribeAsync(recordingPath, config);
            var processedText = await ProcessWithLlmAsync(transcription, config);
            await TypeAsync(processedText, config);
        }
        catch (Exception ex)
        {
            SendNotification("Hush Error", ex.Message);
        }
        finally
        {
            // Also runs on failure, so a failed run does not leave the
            // temporary WAV behind.
            TryDeleteRecording(recordingPath);
        }
    }

    /// <summary>
    /// Cancels the current recording and discards its file without processing
    /// it. Does nothing when no recording is active.
    /// </summary>
    public async Task AbortAsync()
    {
        string? recordingPath;

        lock (_lock)
        {
            if (!_isRecording)
            {
                return;
            }

            recordingPath = _recordingPath;
            _isRecording = false;
        }

        try
        {
            // Wait for the recorder to finish before deleting the file it
            // writes to.
            await _recorder.StopRecording();
        }
        finally
        {
            TryDeleteRecording(recordingPath);
        }
    }

    /// <summary>
    /// Best-effort removal of a temporary recording; a failure is logged to
    /// the console rather than thrown.
    /// </summary>
    private static void TryDeleteRecording(string? path)
    {
        if (string.IsNullOrEmpty(path))
        {
            return;
        }

        try
        {
            File.Delete(path);
        }
        catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
        {
            Console.WriteLine($"[Hush] Could not delete temporary recording '{path}': {ex.Message}");
        }
    }

    /// <summary>
    /// Transcribes the audio file at <paramref name="path"/> with the
    /// configured Whisper provider. An empty configured language is passed to
    /// the provider as <c>null</c>.
    /// </summary>
    private async Task<string> TranscribeAsync(string path, HushConfig config)
    {
        var provider = GetAudioToTextProvider(config);
        await using var stream = File.OpenRead(path);

        return await provider.TranscribeAsync(
            stream,
            config.WhisperModel,
            language: string.IsNullOrEmpty(config.WhisperLanguage) ? null : config.WhisperLanguage);
    }

    // System prompt used when the configuration does not supply its own.
    private const string DefaultSystemPrompt = """
        You are a transcription post-processor. Your task is to clean up raw speech-to-text output and return polished, ready-to-type text.

        Rules:
        - Detect the language of the transcription and process it entirely in that language — do not translate
        - Fix grammar, spelling, and punctuation errors introduced by the speech recognizer, following the conventions of the detected language
        - Capitalize sentences and proper nouns appropriately for the detected language
        - Remove filler words and false starts appropriate to the detected language (e.g. "um", "uh", "like" in English; "euh", "bah" in French; "äh", "ähm" in German; "eh", "tipo" in Spanish/Italian)
        - Preserve the speaker's original intent, vocabulary choices, and tone
        - Do not add, remove, or reinterpret content beyond what was said
        - Do not include any explanation, preamble, or metadata — output only the corrected text
        - If the input is empty or unintelligible, return an empty string
        """;

    /// <summary>
    /// Sends <paramref name="text"/> to the configured LLM, using the
    /// configured system prompt or <see cref="DefaultSystemPrompt"/> when the
    /// configured one is blank.
    /// </summary>
    private async Task<string> ProcessWithLlmAsync(string text, HushConfig config)
    {
        var provider = GetTextProvider(config);
        var systemPrompt = string.IsNullOrWhiteSpace(config.SystemPrompt)
            ? DefaultSystemPrompt
            : config.SystemPrompt;

        return await provider.CompleteTextAsync(systemPrompt, text, config.LlmModel);
    }

    /// <summary>
    /// Asks the configured LLM to write a profile system prompt from a
    /// free-form description of what the profile should do.
    /// </summary>
    /// <param name="description">The user's description of the desired profile.</param>
    /// <returns>The text returned by the LLM.</returns>
    public async Task<string> GenerateProfilePromptAsync(string description)
    {
        var config = _configManager.Load();
        var provider = GetTextProvider(config);

        const string systemPrompt = """
            You are a configuration assistant for Hush, a Linux speech-to-text post-processor. Hush records the user's voice, transcribes it with Whisper, then passes the transcription to an LLM using a system prompt you will write.

            Given the user's description of what they want the profile to do, write a precise, concise system prompt that instructs the LLM how to transform the raw transcription.

            Rules:
            - Output only the system prompt text, nothing else
            - Do not include meta-commentary, labels, or markdown formatting
            - The prompt must be self-contained and unambiguous
            - Always end with an instruction to output only the final result with no explanation
            """;

        return await provider.CompleteTextAsync(systemPrompt, description, config.LlmModel);
    }

    /// <summary>Types <paramref name="text"/> through the configured input backend.</summary>
    private async Task TypeAsync(string text, HushConfig config)
    {
        var input = GetTextInput(config);
        await input.TypeString(text);
    }

    /// <summary>
    /// Builds the speech-to-text provider named by <c>WhisperProvider</c>.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The provider name is not supported or its API key is missing.
    /// </exception>
    private IAudioToTextProvider GetAudioToTextProvider(HushConfig config) => config.WhisperProvider switch
    {
        "groq" => string.IsNullOrEmpty(config.GroqApiKey)
            ? throw new InvalidOperationException("Groq API key is required for Whisper transcription")
            : new GroqProvider(config.GroqApiKey, _httpClient),
        "fireworks" => string.IsNullOrEmpty(config.FireworksApiKey)
            ? throw new InvalidOperationException("Fireworks API key is required for Whisper transcription")
            : new FireworksProvider(config.FireworksApiKey, _httpClient),
        _ => throw new InvalidOperationException($"Unsupported Whisper provider: {config.WhisperProvider}")
    };

    /// <summary>
    /// Builds the text-completion provider named by <c>LlmProvider</c>.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The provider name is not supported or its API key is missing.
    /// </exception>
    private ITextStreamingProvider GetTextProvider(HushConfig config) => config.LlmProvider switch
    {
        "groq" => string.IsNullOrEmpty(config.GroqApiKey)
            ? throw new InvalidOperationException("Groq API key is required for LLM")
            : new GroqProvider(config.GroqApiKey, _httpClient),
        "fireworks" => string.IsNullOrEmpty(config.FireworksApiKey)
            ? throw new InvalidOperationException("Fireworks API key is required for LLM")
            : new FireworksProvider(config.FireworksApiKey, _httpClient),
        _ => throw new InvalidOperationException($"Unsupported LLM provider: {config.LlmProvider}")
    };

    /// <summary>
    /// Selects the typing backend: xdotool when configured, otherwise wtype.
    /// </summary>
    private static ITextInput GetTextInput(HushConfig config) => config.TypingBackend switch
    {
        "xdotool" => new XdotoolInput(),
        _ => new WtypeInput()
    };

    /// <summary>
    /// Selects the audio recorder: ffmpeg when configured, otherwise PipeWire.
    /// </summary>
    private IAudioRecorder CreateAudioRecorder()
    {
        var config = _configManager.Load();

        return config.AudioBackend switch
        {
            "ffmpeg" => new FfmpegAudioRecorder(),
            _ => new PipewireAudioRecorder()
        };
    }

    /// <summary>
    /// Shows a desktop notification through <c>notify-send</c>, falling back
    /// to a console line when the process cannot be run.
    /// </summary>
    private static void SendNotification(string title, string message)
    {
        try
        {
            var startInfo = new System.Diagnostics.ProcessStartInfo
            {
                FileName = "notify-send",
                UseShellExecute = false,
                CreateNoWindow = true
            };

            // ArgumentList escapes each argument individually, so quotes or
            // backslashes in an exception message cannot break the command line.
            startInfo.ArgumentList.Add(title);
            startInfo.ArgumentList.Add(message);

            using var process = new System.Diagnostics.Process { StartInfo = startInfo };
            process.Start();
            process.WaitForExit();
        }
        catch (Exception)
        {
            Console.WriteLine($"[Notification] {title}: {message}");
        }
    }

    /// <summary>
    /// Measures provider latency by transcribing one second of generated
    /// silence and passing the result through the LLM step.
    /// </summary>
    /// <returns>
    /// Speech-to-text, LLM and combined durations in milliseconds.
    /// </returns>
    public async Task<LatencyResult> RunLatencyTestAsync()
    {
        var config = _configManager.Load();

        // Build the test audio before starting the clock so the
        // speech-to-text figure does not include local setup.
        var wavBytes = GenerateSilentWav(1.0);
        await using var wavStream = new MemoryStream(wavBytes);

        var sttStopwatch = System.Diagnostics.Stopwatch.StartNew();
        var transcription = await TranscribeStreamAsync(wavStream, config);
        sttStopwatch.Stop();

        var llmStopwatch = System.Diagnostics.Stopwatch.StartNew();
        var processedText = await ProcessWithLlmAsync(transcription, config);
        llmStopwatch.Stop();

        return new LatencyResult(
            (int)sttStopwatch.ElapsedMilliseconds,
            (int)llmStopwatch.ElapsedMilliseconds,
            (int)(sttStopwatch.ElapsedMilliseconds + llmStopwatch.ElapsedMilliseconds)
        );
    }

    /// <summary>
    /// Transcribes an in-memory audio stream with the configured Whisper
    /// provider, without passing a language.
    /// </summary>
    private async Task<string> TranscribeStreamAsync(Stream stream, HushConfig config)
    {
        var provider = GetAudioToTextProvider(config);
        return await provider.TranscribeAsync(stream, config.WhisperModel);
    }

    /// <summary>
    /// Builds a 16 kHz, 16-bit, mono PCM WAV file containing only silence.
    /// </summary>
    /// <param name="durationSeconds">Length of the audio in seconds; must not be negative.</param>
    /// <returns>The complete WAV file, header included.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="durationSeconds"/> is negative.
    /// </exception>
    private static byte[] GenerateSilentWav(double durationSeconds)
    {
        ArgumentOutOfRangeException.ThrowIfNegative(durationSeconds);

        int sampleRate = 16000;
        short bitsPerSample = 16;
        int channels = 1;
        int blockAlign = channels * (bitsPerSample / 8);

        // Derive the byte count from the truncated sample count so the size
        // declared in the header always matches the bytes actually written.
        int samples = (int)(sampleRate * durationSeconds);
        int dataChunkSize = samples * blockAlign;
        int fileSize = 36 + dataChunkSize;

        using var ms = new MemoryStream();
        using var writer = new BinaryWriter(ms);

        // RIFF container header.
        writer.Write("RIFF"u8.ToArray());
        writer.Write(fileSize);
        writer.Write("WAVE"u8.ToArray());

        // "fmt " chunk: 16-byte body, format tag 1.
        writer.Write("fmt "u8.ToArray());
        writer.Write(16);
        writer.Write((short)1);
        writer.Write((short)channels);
        writer.Write(sampleRate);
        writer.Write(sampleRate * blockAlign);
        writer.Write((short)blockAlign);
        writer.Write(bitsPerSample);

        // "data" chunk: all-zero samples.
        writer.Write("data"u8.ToArray());
        writer.Write(dataChunkSize);
        writer.Write(new byte[dataChunkSize]);

        return ms.ToArray();
    }
}