From 27d7d11b63d75b636ecf249ce44c1c848f13e6ab Mon Sep 17 00:00:00 2001 From: TomiEckert Date: Sat, 28 Feb 2026 12:42:41 +0100 Subject: [PATCH] feat: Implement daemon support for piping LLM output to stdout or clipboard via an extended socket protocol and update project documentation. --- Commands/ToggleCommand.cs | 20 ++++++++++++-- Core/DaemonService.cs | 39 ++++++++++++++++++++++----- DAEMON_PLAN.md | 57 --------------------------------------- docs/PROTOCOL.md | 56 ++++++++++++++++++++++++++++++++++++++ docs/STRUCTURE.md | 54 +++++++++++++++++++++++++++++++++++++ 5 files changed, 160 insertions(+), 66 deletions(-) delete mode 100644 DAEMON_PLAN.md create mode 100644 docs/PROTOCOL.md create mode 100644 docs/STRUCTURE.md diff --git a/Commands/ToggleCommand.cs b/Commands/ToggleCommand.cs index c5060fb..50d6881 100644 --- a/Commands/ToggleCommand.cs +++ b/Commands/ToggleCommand.cs @@ -20,13 +20,29 @@ public static class ToggleCommand var endPoint = new UnixDomainSocketEndPoint(socketPath); await socket.ConnectAsync(endPoint); - // Send TOGGLE (cmd == 4) - await socket.SendAsync(new byte[] { 4 }, SocketFlags.None); + // Send TOGGLE (cmd == 4), pipeToStdout, copyToClipboard + var msg = new byte[] { 4, (byte)(pipeToStdout ? 1 : 0), (byte)(copyToClipboard ? 
1 : 0) }; + await socket.SendAsync(msg, SocketFlags.None); if (verbose) { Console.WriteLine("Sent TOGGLE command to daemon."); } + + // Wait for response text if pipeToStdout or if it takes a while we just wait until the socket closes + // This is required so the client process stays alive to receive the text through stdout + var responseBuffer = new byte[4096]; + while (true) + { + int received = await socket.ReceiveAsync(responseBuffer, SocketFlags.None); + if (received == 0) break; // socket closed by daemon + + if (pipeToStdout) + { + var text = System.Text.Encoding.UTF8.GetString(responseBuffer, 0, received); + Console.Write(text); + } + } } catch (SocketException) { diff --git a/Core/DaemonService.cs b/Core/DaemonService.cs index 2c3c4bc..5a1bd38 100644 --- a/Core/DaemonService.cs +++ b/Core/DaemonService.cs @@ -77,18 +77,21 @@ public static class DaemonService { try { - var buffer = new byte[1]; + var buffer = new byte[3]; int bytesRead = await client.ReceiveAsync(buffer, SocketFlags.None); if (bytesRead > 0) { byte cmd = buffer[0]; + bool pipeToStdout = bytesRead > 1 && buffer[1] == 1; + bool copyToClipboard = bytesRead > 2 && buffer[2] == 1; + if (cmd == 1) // START { await ProcessStartRecordingAsync(); } else if (cmd == 2) // STOP { - await ProcessStopRecordingAsync(); + await ProcessStopRecordingAsync(client, pipeToStdout, copyToClipboard); } else if (cmd == 3) // ABORT { @@ -97,7 +100,7 @@ public static class DaemonService else if (cmd == 4) // TOGGLE { if (StateTracker.IsRecording()) - await ProcessStopRecordingAsync(); + await ProcessStopRecordingAsync(client, pipeToStdout, copyToClipboard); else await ProcessStartRecordingAsync(); } @@ -123,7 +126,7 @@ public static class DaemonService AudioRecorder.StartRecording(); } - private static async Task ProcessStopRecordingAsync() + private static async Task ProcessStopRecordingAsync(Socket client, bool pipeToStdout, bool copyToClipboard) { if (!StateTracker.IsRecording()) return; @@ -173,9 +176,31 @@ 
public static class DaemonService { Logger.LogDebug("Starting LLM text refinement (streaming)..."); var tokenStream = _groqClient.RefineTextStreamAsync(transcript, systemPrompt, config.LlmModel); - await TextInjector.InjectStreamAsync(tokenStream, config.TypingBackend); - stopWatch.Stop(); - Notifications.Notify("Toak", $"Done in {stopWatch.ElapsedMilliseconds}ms"); + + if (pipeToStdout || copyToClipboard) + { + string fullText = ""; + await foreach (var token in tokenStream) + { + fullText += token; + if (pipeToStdout) + { + await client.SendAsync(System.Text.Encoding.UTF8.GetBytes(token), SocketFlags.None); + } + } + stopWatch.Stop(); + if (copyToClipboard) + { + ClipboardManager.Copy(fullText); + Notifications.Notify("Toak", $"Copied to clipboard in {stopWatch.ElapsedMilliseconds}ms"); + } + } + else + { + await TextInjector.InjectStreamAsync(tokenStream, config.TypingBackend); + stopWatch.Stop(); + Notifications.Notify("Toak", $"Done in {stopWatch.ElapsedMilliseconds}ms"); + } } } catch (Exception ex) diff --git a/DAEMON_PLAN.md b/DAEMON_PLAN.md deleted file mode 100644 index d0913a2..0000000 --- a/DAEMON_PLAN.md +++ /dev/null @@ -1,57 +0,0 @@ -# Toak: Client-Server & PipeWire Architecture Specification - -This document outlines the transition of Toak from a monolithic, ephemeral CLI application to a persistent, low-latency background daemon utilizing Linux Inter-Process Communication (IPC) and PipeWire. - -## 1. System Architecture Overview - -The system is divided into two distinct binaries to separate the heavy runtime environment from the instant-trigger mechanism. - -* **Toak Daemon (`toakd`):** A persistent C# background service. It holds the API connections, memory buffers, and audio routing open. -* **Toak Client (`toak`):** A lightweight, ephemeral trigger executed by the window manager that simply sends signals to the daemon. - -## 2. 
The Toak Daemon (Server) - -Built as a C# `.NET Hosted Service`, this component runs continuously in the background and manages three primary responsibilities: - -### A. Unix Domain Socket Listener - -* Listens on a secure, user-space socket (e.g., `/run/user/1000/toak.sock`). -* Awaits basic byte-sized instructions from the client (e.g., `START_RECORDING`, `STOP_RECORDING`, `ABORT`). -* Ensures single-instance execution and rejects unauthorized cross-user connections. - -### B. PipeWire Audio Node - -* Connects to the PipeWire graph as a native audio sink. -* Dynamically links to the default system microphone *only* upon receiving the `START_RECORDING` signal. -* Reads the audio stream directly into a pre-allocated C# `MemoryStream` via memory-mapped buffers (zero-copy), requesting the exact format required by the Groq Whisper API (e.g., 16kHz, mono). -* Unlinks from the microphone instantly upon receiving the `STOP_RECORDING` signal, freeing the hardware device. - -### C. State & API Management - -* Maintains a persistent `HttpClient` connection pool to Groq, eliminating TLS handshake overhead for each dictation. -* Triggers the Wayland (`wtype`) or X11 (`xdotool`) typing backend as a child process once the refined transcription is returned. - -## 3. The Toak Client (Trigger) - -A minimal executable designed to be fired by global window manager hotkeys (e.g., Sway, Hyprland, KDE). - -* **Stateless:** Contains no audio logic, API keys, or large library dependencies. -* **Execution:** Connects to the daemon's Unix socket, writes a specific control byte, and exits immediately. -* **Latency:** Execution time is measured in microseconds, preventing any blocking of the desktop compositor's input thread. - -## 4. Deployment & Lifecycle Management - -The daemon is managed by the host's native init system to ensure uptime and clean restarts. - -* **Systemd User Service:** Installed as `~/.config/systemd/user/toak.service`. 
-* **Lifecycle:** Starts automatically on user login (`default.target`), restarts automatically on failure, and manages its own logging via `journalctl`. -* **Environment:** Inherits the active Wayland/X11 display variables necessary for the typing backends to inject keystrokes into the active window. - -## 5. Execution Flow (The PTT Lifecycle) - -1. **Init:** User logs in. Systemd starts `toakd`. It allocates memory, opens API connections, and begins listening on the Unix socket. -2. **KeyDown:** User holds the Push-to-Talk hotkey. Window manager executes `toak --start`. -3. **Link:** `toakd` receives the signal over the socket and tells PipeWire to link the microphone to its internal buffer. -4. **Dictation:** User speaks. Audio fills the C# `MemoryStream`. -5. **KeyUp:** User releases the hotkey. Window manager executes `toak --stop`. -6. **Unlink & Send:** `toakd` unlinks the microphone, flushes the memory buffer directly to the Groq API, receives the transcription, and executes the typing backend. \ No newline at end of file diff --git a/docs/PROTOCOL.md b/docs/PROTOCOL.md new file mode 100644 index 0000000..f7c08b2 --- /dev/null +++ b/docs/PROTOCOL.md @@ -0,0 +1,56 @@ +# Daemon Socket Protocol + +Toak uses a lightweight, custom Unix Domain Socket protocol for IPC (Inter-Process Communication). This allows front-end short-lived CLI tools (like `toak toggle`) to execute instantly while the persistent state and API operations happen inside the background daemon (`toak daemon`). + +## Connection + +The UNIX domain socket is typically located at: +`$XDG_RUNTIME_DIR/toak.sock` (falls back to `/tmp/toak.sock` if `$XDG_RUNTIME_DIR` is not set). + +## Message Format + +Clients send small byte arrays to issue commands to the server. Depending on the command, the structure ranges from a single byte to a 3-byte payload containing the command ID and configuration flags for standard output handling. 
+ +### Command Bytes + +| Command | Byte | Description | +|---|---|---| +| **START** | `1` | Forces the daemon to start recording. Ignored if already recording. | +| **STOP** | `2` | Forces the daemon to stop recording and begin processing the audio. Takes flags for response handling. | +| **ABORT** | `3` | Stops audio recording and discards the buffer without making API calls. | +| **TOGGLE** | `4` | Stops recording if currently recording; starts recording if currently inactive. Takes flags for response handling. | + +## Payload Formats + +### 1-Byte Payloads (`START`, `ABORT`) +When the client only needs to trigger state changes without receiving processing results back, it sends a single byte. + +```text +[ Command Byte ] +``` +Example (`ABORT`): `[ 0x03 ]` + +### 3-Byte Payloads (`STOP`, `TOGGLE`) +When asking the daemon to process audio, the client can specify how the finalized text is delivered: typed into the active window (default), piped to standard output (`--pipe`), or copied to the clipboard (`--copy`). + +The client sends exactly 3 bytes: +```text +[ Command Byte ] [ Pipe Flag ] [ Copy Flag ] +``` + +- **Byte 0:** The command (`0x02` or `0x04`) +- **Byte 1:** Pipe to Stdout: `0x01` if enabled, `0x00` if disabled. +- **Byte 2:** Copy to Clipboard: `0x01` if enabled, `0x00` if disabled. + +Example (`TOGGLE` with stdout piping enabled): `[ 0x04, 0x01, 0x00 ]` + +## Server Responses + +Depending on the flags provided in a 3-Byte Payload: + +1. **Default (No flags set):** + The server will process the audio, handle LLM modifications, and inject the text into the user's active window using Wayland (`wtype`) or X11 (`xdotool`). The socket is closed by the server. + +2. **Pipe or Copy Flag Set:** + The client keeps the connection open and reads from the socket until the server closes it. + When the Pipe flag is set, the server streams UTF-8 encoded text chunks back to the client as the LLM generates them, and the client writes these chunks to `stdout`. When the Copy flag is set, the daemon itself copies the complete text to the clipboard after generation finishes; nothing is sent over the socket for this flag alone. 
Once sending is complete, the server closes the socket. diff --git a/docs/STRUCTURE.md b/docs/STRUCTURE.md new file mode 100644 index 0000000..7889ad4 --- /dev/null +++ b/docs/STRUCTURE.md @@ -0,0 +1,54 @@ +# Toak Project Structure + +This document outlines the high-level architecture and directory structure of the Toak project to help contributors navigate the codebase. + +## Overview + +Toak is designed as a fast, Linux-native dictation application utilizing C# AOT (Ahead-Of-Time compilation) for minimal latency. It operates primarily as a client-daemon architecture where background application state is managed by a daemon process while short-lived CLI commands issue control messages via Unix domain sockets. + +## Directory Structure + +```text +Toak/ +├── Api/ +│ ├── GroqApiClient.cs # Client for external transcription and LLM API calls (Groq/Whisper) +│ └── Models/ # API payload representations +├── Assets/ # Sound files or other static resources +├── Audio/ +│ └── AudioRecorder.cs # Handles audio capture via system utilities (e.g., ffmpeg/arecord) +├── Commands/ +│ ├── ToggleCommand.cs # Start/stop recording and pass pipe/copy flags +│ ├── DiscardCommand.cs # Abort the current recording +│ ├── OnboardCommand.cs # Initial interactive configuration setup +│ ├── ConfigUpdaterCommand.cs # Direct configuration modifications +│ ├── ShowCommand.cs # Display current configuration +│ └── LatencyTestCommand.cs # Benchmark tool for API calls +├── Configuration/ +│ ├── ConfigManager.cs # Loads and saves JSON configuration from the user's home folder +│ └── ToakConfig.cs # Data model for user preferences +├── Core/ +│ ├── DaemonService.cs # The background daemon maintaining the socket server and handling states +│ ├── Logger.cs # Logging utility (verbose logging) +│ ├── PromptBuilder.cs # Constructs the system prompts for the LLM based on user settings +│ ├── StateTracker.cs # Tracks the current application state (e.g. is recording active?) 
+│ └── Skills/ # Modular capabilities (e.g., Terminal mode, Language Translation) +├── IO/ +│ ├── ClipboardManager.cs # Cross-session (Wayland/X11) clipboard manipulation (`wl-copy`, `xclip`) +│ ├── TextInjector.cs # Native keyboard injection handling (`wtype`, `xdotool`) +│ └── Notifications.cs # System notifications (`notify-send`) and sound playback (`paplay`) +├── Serialization/ +│ └── AppJsonSerializerContext.cs # System.Text.Json source generation context for AOT support +├── docs/ # Documentation +└── Program.cs # Application entry point using System.CommandLine +``` + +## Key Architectural Concepts + +### The Daemon Process +The `DaemonService` (`toak daemon`) is the heart of Toak. It listens on a Unix domain socket for IPC messages. This allows `toak toggle` to execute almost instantaneously, delegating all heavy lifting and state management to an already-hot background process. + +### Unix Sockets IPC +Client commands communicate with the daemon via Unix sockets. For details on the byte payloads used for communication, please refer to [PROTOCOL.md](./PROTOCOL.md). + +### AOT Compilation +The project relies on Native AOT compilation (`dotnet publish -c Release -r linux-x64 -p:PublishAot=true`) to avoid JIT-startup time on CLI executions, making `toak toggle` fast enough to bind seamlessly to hotkeys.