feat: Implement daemon support for piping LLM output to stdout or clipboard via an extended socket protocol and update project documentation.
This commit is contained in:
@@ -20,13 +20,29 @@ public static class ToggleCommand
|
||||
var endPoint = new UnixDomainSocketEndPoint(socketPath);
|
||||
await socket.ConnectAsync(endPoint);
|
||||
|
||||
// Send TOGGLE (cmd == 4)
|
||||
await socket.SendAsync(new byte[] { 4 }, SocketFlags.None);
|
||||
// Send TOGGLE (cmd == 4), pipeToStdout, copyToClipboard
|
||||
var msg = new byte[] { 4, (byte)(pipeToStdout ? 1 : 0), (byte)(copyToClipboard ? 1 : 0) };
|
||||
await socket.SendAsync(msg, SocketFlags.None);
|
||||
|
||||
if (verbose)
|
||||
{
|
||||
Console.WriteLine("Sent TOGGLE command to daemon.");
|
||||
}
|
||||
|
||||
// Keep reading until the daemon closes the socket; received text is written to stdout only when pipeToStdout is set.
|
||||
// This is required so the client process stays alive to receive the text through stdout
|
||||
var responseBuffer = new byte[4096];
|
||||
while (true)
|
||||
{
|
||||
int received = await socket.ReceiveAsync(responseBuffer, SocketFlags.None);
|
||||
if (received == 0) break; // socket closed by daemon
|
||||
|
||||
if (pipeToStdout)
|
||||
{
|
||||
var text = System.Text.Encoding.UTF8.GetString(responseBuffer, 0, received);
|
||||
Console.Write(text);
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (SocketException)
|
||||
{
|
||||
|
||||
@@ -77,18 +77,21 @@ public static class DaemonService
|
||||
{
|
||||
try
|
||||
{
|
||||
var buffer = new byte[1];
|
||||
var buffer = new byte[3];
|
||||
int bytesRead = await client.ReceiveAsync(buffer, SocketFlags.None);
|
||||
if (bytesRead > 0)
|
||||
{
|
||||
byte cmd = buffer[0];
|
||||
bool pipeToStdout = bytesRead > 1 && buffer[1] == 1;
|
||||
bool copyToClipboard = bytesRead > 2 && buffer[2] == 1;
|
||||
|
||||
if (cmd == 1) // START
|
||||
{
|
||||
await ProcessStartRecordingAsync();
|
||||
}
|
||||
else if (cmd == 2) // STOP
|
||||
{
|
||||
await ProcessStopRecordingAsync();
|
||||
await ProcessStopRecordingAsync(client, pipeToStdout, copyToClipboard);
|
||||
}
|
||||
else if (cmd == 3) // ABORT
|
||||
{
|
||||
@@ -97,7 +100,7 @@ public static class DaemonService
|
||||
else if (cmd == 4) // TOGGLE
|
||||
{
|
||||
if (StateTracker.IsRecording())
|
||||
await ProcessStopRecordingAsync();
|
||||
await ProcessStopRecordingAsync(client, pipeToStdout, copyToClipboard);
|
||||
else
|
||||
await ProcessStartRecordingAsync();
|
||||
}
|
||||
@@ -123,7 +126,7 @@ public static class DaemonService
|
||||
AudioRecorder.StartRecording();
|
||||
}
|
||||
|
||||
private static async Task ProcessStopRecordingAsync()
|
||||
private static async Task ProcessStopRecordingAsync(Socket client, bool pipeToStdout, bool copyToClipboard)
|
||||
{
|
||||
if (!StateTracker.IsRecording()) return;
|
||||
|
||||
@@ -173,9 +176,31 @@ public static class DaemonService
|
||||
{
|
||||
Logger.LogDebug("Starting LLM text refinement (streaming)...");
|
||||
var tokenStream = _groqClient.RefineTextStreamAsync(transcript, systemPrompt, config.LlmModel);
|
||||
await TextInjector.InjectStreamAsync(tokenStream, config.TypingBackend);
|
||||
stopWatch.Stop();
|
||||
Notifications.Notify("Toak", $"Done in {stopWatch.ElapsedMilliseconds}ms");
|
||||
|
||||
if (pipeToStdout || copyToClipboard)
|
||||
{
|
||||
string fullText = "";
|
||||
await foreach (var token in tokenStream)
|
||||
{
|
||||
fullText += token;
|
||||
if (pipeToStdout)
|
||||
{
|
||||
await client.SendAsync(System.Text.Encoding.UTF8.GetBytes(token), SocketFlags.None);
|
||||
}
|
||||
}
|
||||
stopWatch.Stop();
|
||||
if (copyToClipboard)
|
||||
{
|
||||
ClipboardManager.Copy(fullText);
|
||||
Notifications.Notify("Toak", $"Copied to clipboard in {stopWatch.ElapsedMilliseconds}ms");
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
await TextInjector.InjectStreamAsync(tokenStream, config.TypingBackend);
|
||||
stopWatch.Stop();
|
||||
Notifications.Notify("Toak", $"Done in {stopWatch.ElapsedMilliseconds}ms");
|
||||
}
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
|
||||
@@ -1,57 +0,0 @@
|
||||
# Toak: Client-Server & PipeWire Architecture Specification
|
||||
|
||||
This document outlines the transition of Toak from a monolithic, ephemeral CLI application to a persistent, low-latency background daemon utilizing Linux Inter-Process Communication (IPC) and PipeWire.
|
||||
|
||||
## 1. System Architecture Overview
|
||||
|
||||
The system is divided into two distinct binaries to separate the heavy runtime environment from the instant-trigger mechanism.
|
||||
|
||||
* **Toak Daemon (`toakd`):** A persistent C# background service. It holds the API connections, memory buffers, and audio routing open.
|
||||
* **Toak Client (`toak`):** A lightweight, ephemeral trigger executed by the window manager that simply sends signals to the daemon.
|
||||
|
||||
## 2. The Toak Daemon (Server)
|
||||
|
||||
Built as a C# `.NET Hosted Service`, this component runs continuously in the background and manages three primary responsibilities:
|
||||
|
||||
### A. Unix Domain Socket Listener
|
||||
|
||||
* Listens on a secure, user-space socket (e.g., `/run/user/1000/toak.sock`).
|
||||
* Awaits basic byte-sized instructions from the client (e.g., `START_RECORDING`, `STOP_RECORDING`, `ABORT`).
|
||||
* Ensures single-instance execution and rejects unauthorized cross-user connections.
|
||||
|
||||
### B. PipeWire Audio Node
|
||||
|
||||
* Connects to the PipeWire graph as a native audio sink.
|
||||
* Dynamically links to the default system microphone *only* upon receiving the `START_RECORDING` signal.
|
||||
* Reads the audio stream directly into a pre-allocated C# `MemoryStream` via memory-mapped buffers (zero-copy), requesting the exact format required by the Groq Whisper API (e.g., 16kHz, mono).
|
||||
* Unlinks from the microphone instantly upon receiving the `STOP_RECORDING` signal, freeing the hardware device.
|
||||
|
||||
### C. State & API Management
|
||||
|
||||
* Maintains a persistent `HttpClient` connection pool to Groq, eliminating TLS handshake overhead for each dictation.
|
||||
* Triggers the Wayland (`wtype`) or X11 (`xdotool`) typing backend as a child process once the refined transcription is returned.
|
||||
|
||||
## 3. The Toak Client (Trigger)
|
||||
|
||||
A minimal executable designed to be fired by global window manager hotkeys (e.g., Sway, Hyprland, KDE).
|
||||
|
||||
* **Stateless:** Contains no audio logic, API keys, or large library dependencies.
|
||||
* **Execution:** Connects to the daemon's Unix socket, writes a specific control byte, and exits immediately.
|
||||
* **Latency:** Execution time is measured in microseconds, preventing any blocking of the desktop compositor's input thread.
|
||||
|
||||
## 4. Deployment & Lifecycle Management
|
||||
|
||||
The daemon is managed by the host's native init system to ensure uptime and clean restarts.
|
||||
|
||||
* **Systemd User Service:** Installed as `~/.config/systemd/user/toak.service`.
|
||||
* **Lifecycle:** Starts automatically on user login (`default.target`), restarts automatically on failure, and manages its own logging via `journalctl`.
|
||||
* **Environment:** Inherits the active Wayland/X11 display variables necessary for the typing backends to inject keystrokes into the active window.
|
||||
|
||||
## 5. Execution Flow (The PTT Lifecycle)
|
||||
|
||||
1. **Init:** User logs in. Systemd starts `toakd`. It allocates memory, opens API connections, and begins listening on the Unix socket.
|
||||
2. **KeyDown:** User holds the Push-to-Talk hotkey. Window manager executes `toak --start`.
|
||||
3. **Link:** `toakd` receives the signal over the socket and tells PipeWire to link the microphone to its internal buffer.
|
||||
4. **Dictation:** User speaks. Audio fills the C# `MemoryStream`.
|
||||
5. **KeyUp:** User releases the hotkey. Window manager executes `toak --stop`.
|
||||
6. **Unlink & Send:** `toakd` unlinks the microphone, flushes the memory buffer directly to the Groq API, receives the transcription, and executes the typing backend.
|
||||
56
docs/PROTOCOL.md
Normal file
56
docs/PROTOCOL.md
Normal file
@@ -0,0 +1,56 @@
|
||||
# Daemon Socket Protocol
|
||||
|
||||
Toak uses a lightweight, custom protocol over a Unix domain socket for IPC (Inter-Process Communication). This allows short-lived front-end CLI commands (like `toak toggle`) to start instantly while persistent state and API operations live inside the background daemon (`toak daemon`).
|
||||
|
||||
## Connection
|
||||
|
||||
The UNIX domain socket is typically located at:
|
||||
`$XDG_RUNTIME_DIR/toak.sock` (falls back to `/tmp/toak.sock` if `$XDG_RUNTIME_DIR` is not set).
|
||||
|
||||
## Message Format
|
||||
|
||||
Clients send small byte arrays to issue commands to the server. Depending on the command, the message is either a single command byte or a 3-byte payload containing the command byte followed by two flags that control output handling (pipe to stdout, copy to clipboard).
|
||||
|
||||
### Command Bytes
|
||||
|
||||
| Command | Byte | Description |
|
||||
|---|---|---|
|
||||
| **START** | `1` | Forces the daemon to start recording. Ignored if already recording. |
|
||||
| **STOP** | `2` | Forces the daemon to stop recording and begin processing the audio. Takes flags for response handling. |
|
||||
| **ABORT** | `3` | Stops audio recording and discards the buffer without making API calls. |
|
||||
| **TOGGLE** | `4` | Stops recording if currently recording; starts recording if currently inactive. Takes flags for response handling. |
|
||||
|
||||
## Payload Formats
|
||||
|
||||
### 1-Byte Payloads (`START`, `ABORT`)
|
||||
When the client only needs to trigger state changes without receiving processing results back, it sends a single byte.
|
||||
|
||||
```text
|
||||
[ Command Byte ]
|
||||
```
|
||||
Example (`ABORT`): `[ 0x03 ]`
|
||||
|
||||
### 3-Byte Payloads (`STOP`, `TOGGLE`)
|
||||
When asking the daemon to process audio, the client can specify how it wants to receive the finalized text: typed into the active window via keyboard injection (default), piped to standard output (`--pipe`), or copied to the clipboard (`--copy`).
|
||||
|
||||
The client sends exactly 3 bytes:
|
||||
```text
|
||||
[ Command Byte ] [ Pipe Flag ] [ Copy Flag ]
|
||||
```
|
||||
|
||||
- **Byte 0:** The command (`0x02` or `0x04`)
|
||||
- **Byte 1:** Pipe to Stdout: `0x01` if enabled, `0x00` if disabled.
|
||||
- **Byte 2:** Copy to Clipboard: `0x01` if enabled, `0x00` if disabled.
|
||||
|
||||
Example (`TOGGLE` with stdout piping enabled): `[ 0x04, 0x01, 0x00 ]`
|
||||
|
||||
## Server Responses
|
||||
|
||||
Depending on the flags provided in a 3-Byte Payload:
|
||||
|
||||
1. **Default (No flags set):**
|
||||
The server will process the audio, handle LLM modifications, and inject the text into the user's active window using Wayland (`wtype`) or X11 (`xdotool`). The socket is closed by the server.
|
||||
|
||||
2. **Pipe or Copy Flag Set:**
|
||||
The client will keep the connection open and wait to read the incoming text from the server.
|
||||
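If the pipe flag is set, the server streams UTF-8 encoded text chunks back to the client as the LLM generates them, and the client writes these chunks to `stdout`. If the copy flag is set, the server copies the complete text to the clipboard once generation finishes; nothing is sent over the socket for the copy flag alone. Once processing is complete, the server closes the socket.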
|
||||
54
docs/STRUCTURE.md
Normal file
54
docs/STRUCTURE.md
Normal file
@@ -0,0 +1,54 @@
|
||||
# Toak Project Structure
|
||||
|
||||
This document outlines the high-level architecture and directory structure of the Toak project to help contributors navigate the codebase.
|
||||
|
||||
## Overview
|
||||
|
||||
Toak is a fast, Linux-native dictation application that uses C# Native AOT (Ahead-Of-Time) compilation for minimal latency. It follows a client-daemon architecture: a background daemon process manages application state, while short-lived CLI commands send it control messages over a Unix domain socket.
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```text
|
||||
Toak/
|
||||
├── Api/
|
||||
│ ├── GroqApiClient.cs # Client for external transcription and LLM API calls (Groq/Whisper)
|
||||
│ └── Models/ # API payload representations
|
||||
├── Assets/ # Sound files or other static resources
|
||||
├── Audio/
|
||||
│ └── AudioRecorder.cs # Handles audio capture via system utilities (e.g., ffmpeg/arecord)
|
||||
├── Commands/
|
||||
│ ├── ToggleCommand.cs # Start/stop recording and pass pipe/copy flags
|
||||
│ ├── DiscardCommand.cs # Abort the current recording
|
||||
│ ├── OnboardCommand.cs # Initial interactive configuration setup
|
||||
│ ├── ConfigUpdaterCommand.cs # Direct configuration modifications
|
||||
│ ├── ShowCommand.cs # Display current configuration
|
||||
│ └── LatencyTestCommand.cs # Benchmark tool for API calls
|
||||
├── Configuration/
|
||||
│ ├── ConfigManager.cs # Loads and saves JSON configuration from the user's home folder
|
||||
│ └── ToakConfig.cs # Data model for user preferences
|
||||
├── Core/
|
||||
│ ├── DaemonService.cs # The background daemon maintaining the socket server and handling states
|
||||
│ ├── Logger.cs # Logging utility (verbose logging)
|
||||
│ ├── PromptBuilder.cs # Constructs the system prompts for the LLM based on user settings
|
||||
│ ├── StateTracker.cs # Tracks the current application state (e.g. is recording active?)
|
||||
│ └── Skills/ # Modular capabilities (e.g., Terminal mode, Language Translation)
|
||||
├── IO/
|
||||
│ ├── ClipboardManager.cs # Cross-session (Wayland/X11) clipboard manipulation (`wl-copy`, `xclip`)
|
||||
│ ├── TextInjector.cs # Native keyboard injection handling (`wtype`, `xdotool`)
|
||||
│ └── Notifications.cs # System notifications (`notify-send`) and sound playback (`paplay`)
|
||||
├── Serialization/
|
||||
│ └── AppJsonSerializerContext.cs # System.Text.Json source generation context for AOT support
|
||||
├── docs/ # Documentation
|
||||
└── Program.cs # Application entry point using System.CommandLine
|
||||
```
|
||||
|
||||
## Key Architectural Concepts
|
||||
|
||||
### The Daemon Process
|
||||
The `DaemonService` (`toak daemon`) is the heart of Toak. It listens on a Unix domain socket for IPC messages. This allows `toak toggle` to execute almost instantaneously, delegating all heavy lifting and state management to an already-hot background process.
|
||||
|
||||
### Unix Sockets IPC
|
||||
Client commands communicate with the daemon via Unix sockets. For details on the byte payloads used for communication, please refer to [PROTOCOL.md](./PROTOCOL.md).
|
||||
|
||||
### AOT Compilation
|
||||
The project relies on Native AOT compilation (`dotnet publish -c Release -r linux-x64 -p:PublishAot=true`) to avoid JIT startup time on CLI executions, making `toak toggle` fast enough to bind seamlessly to hotkeys.
|
||||
Reference in New Issue
Block a user