// hush/Hush.Daemon/src/Orchestrator.cs

using Hush.Audio;
using Hush.Config;
using Hush.Input;
using Hush.Providers.Interfaces;
using Hush.Providers.Providers;

namespace Hush.Daemon;

/// <summary>
/// Drives the dictation pipeline: record audio to a temporary WAV file, transcribe it,
/// post-process the transcription with an LLM and type the result.
/// </summary>
/// <remarks>
/// Recording state (<c>_isRecording</c>, <c>_recordingPath</c>, <c>_recordingStartTime</c>) is
/// only read or written while holding <c>_lock</c>. The lock is never held across an await.
/// </remarks>
public class Orchestrator
{
    // Shared by every provider instance so connections are reused across requests.
    private static readonly HttpClient _httpClient = new();

    private readonly ConfigManager _configManager;
    private readonly IAudioRecorder _recorder;

    private string? _recordingPath;
    private DateTime? _recordingStartTime;
    private bool _isRecording;
    private readonly Lock _lock = new();

    /// <summary>
    /// Creates an orchestrator and selects the audio recorder from the configuration
    /// loaded at construction time.
    /// </summary>
    /// <param name="configManager">Source of the <see cref="HushConfig"/> used by this instance.</param>
    public Orchestrator(ConfigManager configManager)
    {
        _configManager = configManager;
        _recorder = CreateAudioRecorder();
    }

    /// <summary>
    /// <c>true</c> only when a recording has been started through this orchestrator
    /// and the underlying recorder also reports that it is recording.
    /// </summary>
    public bool IsRecording
    {
        get
        {
            lock (_lock)
            {
                return _isRecording && _recorder.IsRecording;
            }
        }
    }

    /// <summary>
    /// Returns the time elapsed since the current recording was started.
    /// </summary>
    /// <returns>The elapsed time, or <c>null</c> when no recording is in progress.</returns>
    public TimeSpan? GetRecordingDuration()
    {
        lock (_lock)
        {
            if (!_isRecording || !_recordingStartTime.HasValue)
                return null;

            return DateTime.UtcNow - _recordingStartTime.Value;
        }
    }

    /// <summary>
    /// Starts recording into a fresh, uniquely named WAV file in the temp directory.
    /// </summary>
    /// <returns>The task returned by the recorder's start operation.</returns>
    /// <exception cref="InvalidOperationException">
    /// Thrown synchronously when a recording is already in progress.
    /// </exception>
    public Task StartRecordingAsync()
    {
        string path;

        lock (_lock)
        {
            if (_isRecording)
                throw new InvalidOperationException("Recording is already in progress");

            path = Path.Combine(Path.GetTempPath(), $"hush_recording_{Guid.NewGuid()}.wav");
            _recordingPath = path;
            _recordingStartTime = DateTime.UtcNow;
            _isRecording = true;
        }

        // Hand the recorder the local copy: the field may be changed by a concurrent
        // stop/abort as soon as the lock is released.
        return StartRecorderAsync(path);
    }

    /// <summary>
    /// Starts the recorder and rolls the recording state back if the start fails, so a
    /// failed start does not leave the orchestrator permanently "already recording".
    /// </summary>
    private async Task StartRecorderAsync(string path)
    {
        try
        {
            await _recorder.StartRecording(path);
        }
        catch
        {
            lock (_lock)
            {
                // Only undo the state that belongs to this attempt; a stop followed by a
                // new start may already have replaced it.
                if (string.Equals(_recordingPath, path, StringComparison.Ordinal))
                {
                    _isRecording = false;
                    _recordingPath = null;
                    _recordingStartTime = null;
                }
            }

            TryDeleteFile(path);
            throw;
        }
    }

    /// <summary>
    /// Stops the current recording and runs it through transcription, LLM post-processing
    /// and typing. Does nothing when no recording is in progress.
    /// </summary>
    /// <param name="overrideConfig">
    /// Configuration to use for this run; when <c>null</c> the configuration is loaded
    /// from the config manager.
    /// </param>
    /// <remarks>
    /// Processing failures are reported through a desktop notification rather than thrown.
    /// A failure of the recorder's stop operation still propagates to the caller.
    /// The temporary recording file is removed in every case.
    /// </remarks>
    public async Task StopAndProcessAsync(HushConfig? overrideConfig = null)
    {
        string? recordingPath;
        TimeSpan recordingDuration;

        lock (_lock)
        {
            if (!_isRecording)
                return;

            recordingPath = _recordingPath;

            // Measure at the moment stop was requested, so recorder shutdown and config
            // loading time do not count towards the minimum-duration check.
            recordingDuration = _recordingStartTime is { } startedAt
                ? DateTime.UtcNow - startedAt
                : TimeSpan.Zero;

            _isRecording = false;
            _recordingPath = null;
            _recordingStartTime = null;
        }

        try
        {
            await _recorder.StopRecording();

            if (string.IsNullOrEmpty(recordingPath) || !File.Exists(recordingPath))
            {
                SendNotification("Error", "Recording file not found");
                return;
            }

            await ProcessRecordingAsync(recordingPath, recordingDuration, overrideConfig);
        }
        finally
        {
            // Always remove the temp file, including when transcription/LLM/typing failed.
            TryDeleteFile(recordingPath);
        }
    }

    /// <summary>
    /// Transcribes, post-processes and types a finished recording. Recordings shorter than
    /// the configured minimum are ignored. Any exception is turned into a notification.
    /// </summary>
    private async Task ProcessRecordingAsync(
        string recordingPath,
        TimeSpan recordingDuration,
        HushConfig? overrideConfig)
    {
        try
        {
            var config = overrideConfig ?? _configManager.Load();

            if (recordingDuration < TimeSpan.FromMilliseconds(config.MinRecordingDuration))
            {
                SendNotification("Hush", "Recording too short, ignored");
                return;
            }

            var transcription = await TranscribeAsync(recordingPath, config);
            var processedText = await ProcessWithLlmAsync(transcription, config);

            await TypeAsync(processedText, config);
        }
        catch (Exception ex)
        {
            SendNotification("Hush Error", ex.Message);
        }
    }

    /// <summary>
    /// Stops the current recording and discards its file without processing it.
    /// Does nothing when no recording is in progress.
    /// </summary>
    /// <remarks>
    /// The recorder is awaited before the file is deleted so the file is not removed
    /// while it is still being written. A failure to stop is reported through a
    /// notification; the file is cleaned up regardless.
    /// </remarks>
    public async Task AbortAsync()
    {
        string? recordingPath;

        lock (_lock)
        {
            if (!_isRecording)
                return;

            recordingPath = _recordingPath;
            _isRecording = false;
            _recordingPath = null;
            _recordingStartTime = null;
        }

        try
        {
            await _recorder.StopRecording();
        }
        catch (Exception ex)
        {
            // Abort is best-effort: surface the problem but do not fail the caller.
            SendNotification("Hush Error", $"Failed to stop recording: {ex.Message}");
        }
        finally
        {
            TryDeleteFile(recordingPath);
        }
    }

    /// <summary>
    /// Deletes a temporary recording file, logging instead of throwing when the file
    /// cannot be removed. A <c>null</c>/empty path or a missing file is a no-op.
    /// </summary>
    private static void TryDeleteFile(string? path)
    {
        if (string.IsNullOrEmpty(path))
            return;

        try
        {
            File.Delete(path);
        }
        catch (Exception ex) when (ex is IOException or UnauthorizedAccessException)
        {
            Console.WriteLine($"[Hush] Could not delete temporary recording '{path}': {ex.Message}");
        }
    }

    /// <summary>
    /// Transcribes the audio file at <paramref name="path"/> with the configured Whisper
    /// provider and model. An empty configured language is passed to the provider as <c>null</c>.
    /// </summary>
    private async Task<string> TranscribeAsync(string path, HushConfig config)
    {
        var provider = GetAudioToTextProvider(config);

        await using var stream = File.OpenRead(path);
        return await provider.TranscribeAsync(
            stream,
            config.WhisperModel,
            language: string.IsNullOrEmpty(config.WhisperLanguage) ? null : config.WhisperLanguage);
    }

    // Used by ProcessWithLlmAsync when the configuration does not supply a system prompt.
    private const string DefaultSystemPrompt =
        """
        You are a transcription post-processor. Your task is to clean up raw speech-to-text output and return polished, ready-to-type text.

        Rules:
        - Detect the language of the transcription and process it entirely in that language — do not translate
        - Fix grammar, spelling, and punctuation errors introduced by the speech recognizer, following the conventions of the detected language
        - Capitalize sentences and proper nouns appropriately for the detected language
        - Remove filler words and false starts appropriate to the detected language (e.g. "um", "uh", "like" in English; "euh", "bah" in French; "äh", "ähm" in German; "eh", "tipo" in Spanish/Italian)
        - Preserve the speaker's original intent, vocabulary choices, and tone
        - Do not add, remove, or reinterpret content beyond what was said
        - Do not include any explanation, preamble, or metadata — output only the corrected text
        - If the input is empty or unintelligible, return an empty string
        """;

    /// <summary>
    /// Sends <paramref name="text"/> to the configured LLM provider and model, using the
    /// configured system prompt or <see cref="DefaultSystemPrompt"/> when none is set.
    /// </summary>
    private async Task<string> ProcessWithLlmAsync(string text, HushConfig config)
    {
        var provider = GetTextProvider(config);
        var systemPrompt = string.IsNullOrWhiteSpace(config.SystemPrompt)
            ? DefaultSystemPrompt
            : config.SystemPrompt;

        return await provider.CompleteTextAsync(systemPrompt, text, config.LlmModel);
    }

    /// <summary>
    /// Asks the configured LLM to write a profile system prompt from a free-form description.
    /// </summary>
    /// <param name="description">The user's description of what the profile should do.</param>
    /// <returns>The text returned by the LLM provider.</returns>
    public async Task<string> GenerateProfilePromptAsync(string description)
    {
        var config = _configManager.Load();
        var provider = GetTextProvider(config);

        const string systemPrompt =
            """
            You are a configuration assistant for Hush, a Linux speech-to-text post-processor.
            Hush records the user's voice, transcribes it with Whisper, then passes the transcription
            to an LLM using a system prompt you will write.

            Given the user's description of what they want the profile to do, write a precise, concise
            system prompt that instructs the LLM how to transform the raw transcription.

            Rules:
            - Output only the system prompt text, nothing else
            - Do not include meta-commentary, labels, or markdown formatting
            - The prompt must be self-contained and unambiguous
            - Always end with an instruction to output only the final result with no explanation
            """;

        return await provider.CompleteTextAsync(systemPrompt, description, config.LlmModel);
    }

    /// <summary>Types <paramref name="text"/> through the configured typing backend.</summary>
    private async Task TypeAsync(string text, HushConfig config)
    {
        var input = GetTextInput(config);
        await input.TypeString(text);
    }

    /// <summary>
    /// Creates the speech-to-text provider named by <c>config.WhisperProvider</c>.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The provider name is not supported, or its API key is missing.
    /// </exception>
    private IAudioToTextProvider GetAudioToTextProvider(HushConfig config) =>
        config.WhisperProvider switch
        {
            "groq" => string.IsNullOrEmpty(config.GroqApiKey)
                ? throw new InvalidOperationException("Groq API key is required for Whisper transcription")
                : new GroqProvider(config.GroqApiKey, _httpClient),
            "fireworks" => string.IsNullOrEmpty(config.FireworksApiKey)
                ? throw new InvalidOperationException("Fireworks API key is required for Whisper transcription")
                : new FireworksProvider(config.FireworksApiKey, _httpClient),
            _ => throw new InvalidOperationException($"Unsupported Whisper provider: {config.WhisperProvider}")
        };

    /// <summary>
    /// Creates the LLM text provider named by <c>config.LlmProvider</c>.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// The provider name is not supported, or its API key is missing.
    /// </exception>
    private ITextStreamingProvider GetTextProvider(HushConfig config) =>
        config.LlmProvider switch
        {
            "groq" => string.IsNullOrEmpty(config.GroqApiKey)
                ? throw new InvalidOperationException("Groq API key is required for LLM")
                : new GroqProvider(config.GroqApiKey, _httpClient),
            "fireworks" => string.IsNullOrEmpty(config.FireworksApiKey)
                ? throw new InvalidOperationException("Fireworks API key is required for LLM")
                : new FireworksProvider(config.FireworksApiKey, _httpClient),
            _ => throw new InvalidOperationException($"Unsupported LLM provider: {config.LlmProvider}")
        };

    /// <summary>
    /// Selects the typing backend: <c>"xdotool"</c> maps to <see cref="XdotoolInput"/>,
    /// any other value falls back to <see cref="WtypeInput"/>.
    /// </summary>
    private static ITextInput GetTextInput(HushConfig config) =>
        config.TypingBackend switch
        {
            "xdotool" => new XdotoolInput(),
            _ => new WtypeInput()
        };

    /// <summary>
    /// Selects the audio recorder from the current configuration: <c>"ffmpeg"</c> maps to
    /// <see cref="FfmpegAudioRecorder"/>, any other value falls back to
    /// <see cref="PipewireAudioRecorder"/>.
    /// </summary>
    private IAudioRecorder CreateAudioRecorder()
    {
        var config = _configManager.Load();

        return config.AudioBackend switch
        {
            "ffmpeg" => new FfmpegAudioRecorder(),
            _ => new PipewireAudioRecorder()
        };
    }

    /// <summary>
    /// Shows a desktop notification via <c>notify-send</c> and waits for the command to exit.
    /// If the command cannot be run, the notification is written to the console instead.
    /// </summary>
    private static void SendNotification(string title, string message)
    {
        try
        {
            using var process = new System.Diagnostics.Process
            {
                StartInfo = new System.Diagnostics.ProcessStartInfo
                {
                    FileName = "notify-send",
                    // Pass each value as its own argument: interpolating into a single
                    // command line breaks when the text contains quotes (common in
                    // exception messages).
                    ArgumentList = { title, message },
                    UseShellExecute = false,
                    CreateNoWindow = true
                }
            };
            process.Start();
            process.WaitForExit();
        }
        catch (Exception)
        {
            // Notifications are best-effort; fall back to the console.
            Console.WriteLine($"[Notification] {title}: {message}");
        }
    }

    /// <summary>
    /// Measures provider round-trip times by transcribing one second of generated silence
    /// and post-processing the resulting transcription with the LLM.
    /// </summary>
    /// <returns>
    /// A <see cref="LatencyResult"/> built from the transcription time, the LLM time and
    /// their sum, all in whole milliseconds.
    /// </returns>
    public async Task<LatencyResult> RunLatencyTestAsync()
    {
        var config = _configManager.Load();

        // Build the test audio before starting the clock so only the provider call is timed.
        await using var wavStream = new MemoryStream(GenerateSilentWav(1.0));

        var sttStopwatch = System.Diagnostics.Stopwatch.StartNew();
        var transcription = await TranscribeStreamAsync(wavStream, config);
        sttStopwatch.Stop();

        var llmStopwatch = System.Diagnostics.Stopwatch.StartNew();
        await ProcessWithLlmAsync(transcription, config);
        llmStopwatch.Stop();

        return new LatencyResult(
            (int)sttStopwatch.ElapsedMilliseconds,
            (int)llmStopwatch.ElapsedMilliseconds,
            (int)(sttStopwatch.ElapsedMilliseconds + llmStopwatch.ElapsedMilliseconds)
        );
    }

    /// <summary>
    /// Transcribes an in-memory audio stream with the configured Whisper provider and model.
    /// No language argument is passed, so the provider's own default applies.
    /// </summary>
    private async Task<string> TranscribeStreamAsync(Stream stream, HushConfig config)
    {
        var provider = GetAudioToTextProvider(config);
        return await provider.TranscribeAsync(stream, config.WhisperModel);
    }

    /// <summary>
    /// Builds an in-memory WAV file (16 kHz, 16-bit, mono PCM) containing only silence.
    /// </summary>
    /// <param name="durationSeconds">Length of the audio; fractional samples are truncated.</param>
    /// <returns>The complete file: a 44-byte header followed by zeroed sample data.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="durationSeconds"/> is negative or not a number.
    /// </exception>
    /// <exception cref="OverflowException">The requested duration does not fit in a WAV of this kind.</exception>
    private static byte[] GenerateSilentWav(double durationSeconds)
    {
        if (double.IsNaN(durationSeconds) || durationSeconds < 0)
        {
            throw new ArgumentOutOfRangeException(
                nameof(durationSeconds),
                durationSeconds,
                "Duration must be a non-negative number of seconds");
        }

        const int sampleRate = 16000;
        const int bitsPerSample = 16;
        const int channels = 1;
        const int blockAlign = channels * (bitsPerSample / 8);
        const int headerSize = 44;

        // Derive the byte count from the truncated sample count so the size declared in
        // the header always equals the number of bytes actually written.
        int sampleCount = checked((int)(sampleRate * durationSeconds));
        int dataChunkSize = checked(sampleCount * blockAlign);

        using var ms = new MemoryStream(headerSize + dataChunkSize);
        using var writer = new BinaryWriter(ms);

        // RIFF container header; the size field excludes the 8 bytes of "RIFF" + size.
        writer.Write("RIFF"u8);
        writer.Write(headerSize - 8 + dataChunkSize);
        writer.Write("WAVE"u8);

        // "fmt " chunk: 16-byte body describing uncompressed PCM (format tag 1).
        writer.Write("fmt "u8);
        writer.Write(16);
        writer.Write((short)1);
        writer.Write((short)channels);
        writer.Write(sampleRate);
        writer.Write(sampleRate * blockAlign);
        writer.Write((short)blockAlign);
        writer.Write((short)bitsPerSample);

        // "data" chunk: all-zero samples, i.e. silence.
        writer.Write("data"u8);
        writer.Write(dataChunkSize);
        writer.Write(new byte[dataChunkSize]);

        writer.Flush();
        return ms.ToArray();
    }
}