diff --git a/Program.cs b/Program.cs index 02206df..85a185e 100644 --- a/Program.cs +++ b/Program.cs @@ -24,7 +24,7 @@ public class Program rootCommand.AddCommand(toggleCmd); // Daemon Command - var daemonCmd = new Command("daemon", "Starts the background background service"); + var daemonCmd = new Command("daemon", "Starts the background service"); daemonCmd.SetHandler(Toak.Core.DaemonService.StartAsync, verboseOption); rootCommand.AddCommand(daemonCmd); diff --git a/README.md b/README.md index 66216cf..0612f36 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # Toak: High-speed Linux Dictation -Toak is a high-speed, professional-grade dictation tool for Linux. It combines state-of-the-art Speech-to-Text (Whisper via Groq) with LLM refinement (Llama/GPT) to provide a seamless, articulate, and highly configurable dictation experience. +Toak is a high-speed, professional-grade dictation tool for Linux. It combines state-of-the-art Speech-to-Text (Whisper via Groq) with LLM refinement (Llama/GPT via Groq or Together AI) to provide a seamless, articulate, and highly configurable dictation experience. -Built with **.NET 10** and compiled to **Native AOT**, Toak runs as a lightning-fast standalone binary with zero runtime overhead. +Built with **.NET 10** and compiled to **Native AOT**, Toak runs as a lightning-fast standalone binary with zero runtime overhead, using a client-daemon architecture for near-zero latency. --- @@ -11,8 +11,11 @@ Built with **.NET 10** and compiled to **Native AOT**, Toak runs as a lightning- - **Blazing Fast**: Uses Groq's API for sub-second Whisper transcription and LLM refinement. - **Native AOT**: Compiled to a native Linux binary for instant startup and minimal footprint. - **Intelligent Refinement**: Automatically fixes grammar, punctuation, and technical terms while preserving your voice. +- **Multi-Provider LLM**: Supports both **Groq** and **Together AI** for text refinement, including Llama 3 models. 
+- **Reasoning Capabilities**: Optional reasoning effort settings for complex text processing. - **Modular Skills**: Actionable "System" commands for translation, terminal execution, professional rewriting, and summarization. -- **Multiple Backends**: Types directly into your active window (`wtype` or `xdotool`), copies to clipboard, or pipes to stdout. +- **Multiple Backends**: Types directly into your active window (`wtype`, `xdotool`, or `ydotool`), copies to clipboard, or pipes to stdout. +- **High-Quality Audio**: Native support for **PipeWire** (`pw-record`) and **FFmpeg** for universal compatibility. - **Beautiful CLI**: Interactive onboarding and configuration powered by `Spectre.Console`. --- @@ -20,9 +23,10 @@ Built with **.NET 10** and compiled to **Native AOT**, Toak runs as a lightning- ## 🛠 Prerequisites - **.NET 10 SDK** (for building from source) -- **pipewire** / **pw-record** (for native Wayland/Linux audio capture) -- **Typing Backend**: `wtype` (Wayland) or `xdotool` (X11) -- **Groq API Key**: Get one at [console.groq.com](https://console.groq.com/) +- **Audio Capture**: `pipewire` / `pw-record` (recommended) or `ffmpeg` +- **Typing Backend**: `wtype` (Wayland), `xdotool` (X11), or `ydotool` (Virtual Input) +- **Clipboard**: `wl-copy` (Wayland) or `xclip` (X11) +- **API Keys**: Groq (required for Whisper) and optionally Together AI. --- @@ -51,23 +55,26 @@ To remove Toak from your system, simply run: ## 🎮 Usage +Toak operates as a persistent **daemon** that manages state and API calls, while the **CLI** sends commands to it via Unix sockets. + ### Core Commands -- **`toak toggle`**: The primary command. Run it to start recording; run it again to stop, transcribe, and type/copy the result. +- **`toak toggle`**: The primary command. Start recording; run again to stop, transcribe, and type/copy/stdout the result. +- **`toak daemon`**: Runs the background service manually (usually managed by systemd). 
+- **`toak onboard`**: Launches the interactive configuration wizard for providers, models, and backends. - **`toak discard`**: Instantly aborts the current recording without performing any transcription. -- **`toak onboard`**: Launches the interactive configuration wizard. - **`toak latency-test`**: Benchmarks your network and API latency to ensure optimal performance. - **`toak show`**: Displays your current configuration in a clean table. -- **`toak config `**: Quickly update a specific setting (e.g., `toak config whisper whisper-large-v3-turbo`). -- **`toak skill`**: Manage dynamic JSON skills via `list`, `add`, or `remove` subcommands. -- **`toak history`**: Display your recent dictation history (`-n `, `--grep `, `--export `, `--shred`). -- **`toak stats`**: Display usage statistics and analytics like most active day and top words. +- **`toak config `**: Update settings like `llm`, `whisper`, `lang`, `backend`, `punctuation`, or `tech`. +- **`toak skill`**: Manage dynamic JSON skills via `list`, `add`, or `remove`. +- **`toak history`**: Query transcription history (`-n `, `--grep `, `--export `, `--shred`). +- **`toak stats`**: Display usage statistics like total recordings, latency, and top words. -### Flags +### Global Flags -- `-p, --pipe`: Output the finalized text to `stdout` instead of typing it. -- `--copy`: Copy the result to the system clipboard. -- `-v, --verbose`: Enable detailed debug logging. +- `-p, --pipe`: Stop recording and output the finalized text to `stdout` instead of typing it. +- `--copy`: Stop recording and copy the result to the system clipboard. +- `-v, --verbose`: Enable detailed debug logging for troubleshooting. --- @@ -75,34 +82,29 @@ To remove Toak from your system, simply run: Toak includes a robust, data-driven skills system triggered by saying hotwords (like **"System"**) at the start of your dictation. Skills are defined as simple JSON files entirely configurable without modifying the C# codebase. 
-At runtime, skills are loaded from `~/.config/toak/skills/`. - ### Default Skills | Skill | Hotwords | Description | Type | | :--- | :--- | :--- | :--- | -| **Terminal** | "System terminal", "System run" | Passes the command to `terminal_action.sh` to execute the spoken shell command. | `script` | +| **Terminal** | "System terminal", "System run" | Translates the request into a bash command and executes it via `terminal_action.sh`. | `script` | | **Translate** | "System translate to [language]" | Translates your dictation into the target language. | `type` | | **Professional**| "System professional", "System formalize" | Rewrites your text to be articulate and formal. | `type` | | **Summary** | "System summary", "System concise" | Strips fluff and provides a direct, crisp summary. | `type` | ### Customizing & Adding Skills -You can build infinite capabilities using the CLI wizard: -```bash -toak skill add -``` - -This generates a JSON file in `~/.config/toak/skills/`. Skills support two actions: -1. **`type` Action**: Transforms the dictated text via a custom System Prompt and types it into the active window (or pipes/copies if flags are requested). -2. **`script` Action**: Transforms the text and passes the result as `$1` to a local shell script, allowing Toak to control system functions, APIs, or smart home devices completely autonomously. +Run `toak skill add` to interactively build custom capabilities. All skills live in `~/.config/toak/skills/`. --- ## ⚙️ Configuration -Toak's behavior is defined in `ToakConfig.cs` and can be managed via the CLI. Key settings include: +Key settings in `ToakConfig.cs` (managed via `toak onboard` or `toak config`): +- `LlmProvider`: Choice of `groq` (default) or `together`. +- `LlmModel`: The refinement model (e.g., `llama-3.3-70b-versatile`). +- `ReasoningEffort`: Set to `none` (default) or `low` for o1-style reasoning. - `WhisperModel`: The STT model (default: `whisper-large-v3-turbo`). 
-- `LlmModel`: The refinement model (default: `openai/gpt-oss-20b`). -- `TypingBackend`: Choose between `wtype` (Wayland) or `xdotool` (X11). +- `WhisperLanguage`: Set spoken language (e.g., `en`, `es`, `fr`). +- `TypingBackend`: Choose between `wtype`, `xdotool`, or `ydotool`. +- `AudioBackend`: Choose between `pw-record` (PipeWire) or `ffmpeg`. - `ModulePunctuation`: Toggle automatic grammar and punctuation fixing. -- `ModuleTechnicalSanitization`: Ensures technical terms like `C#`, `SQL`, or `API` are formatted correctly. +- `ModuleTechnicalSanitization`: Ensures technical terms are formatted correctly. diff --git a/docs/HISTORY_AND_STATS.md b/docs/HISTORY_AND_STATS.md index 774b221..2d40a22 100644 --- a/docs/HISTORY_AND_STATS.md +++ b/docs/HISTORY_AND_STATS.md @@ -1,10 +1,10 @@ -# History and Stats Implementation Plan +# History and Stats This document outlines the design and implementation of the `history` and `stats` features in Toak. ## Data Storage -All transcriptions will be stored in a JSON Lines (`.jsonl`) file located at `~/.local/share/toak/history.jsonl`. -Since Toak uses Native AOT and JSON serialization needs source generation, we'll keep the model simple. +All transcriptions are stored in a JSON Lines (`.jsonl`) file. +- **Location**: `~/.local/share/toak/history.jsonl` **Entry Model:** ```json @@ -12,32 +12,29 @@ Since Toak uses Native AOT and JSON serialization needs source generation, we'll "Timestamp": "2025-01-15T09:23:00Z", "RawTranscript": "hello world", "RefinedText": "Hello world.", - "SkillName": "Professional", // null if default type/script - "DurationMs": 1500 // time taken for STT + LLM + "SkillName": "Professional", // null if default + "DurationMs": 1500 // total processing time } ``` ## `toak history` Command -Provides access to past dictations. - -- `toak history` - Shows the last 10 entries. -- `toak history -n ` - Shows the last `` entries. 
-- `toak history --grep ` - Filters the history entries matching the given keyword in the RefinedText (case-insensitive). -- `toak history --export ` - Writes the output as a Markdown file. -- `toak history --shred` - Deletes the `history.jsonl` file entirely. +The CLI provides access to past dictations: +- `toak history`: Shows the last 10 entries. +- `-n `: Shows the last `` entries. +- `--grep `: Case-insensitive search through refined text. +- `--export `: Export history as a Markdown file. +- `--shred`: Securely delete the entire `history.jsonl` file. ## `toak stats` Command -Reads the `history.jsonl` file and outputs usage analytics using `Spectre.Console`. +Aggregates usage metrics from the history file: +- **Total recordings**: Count of all entries. +- **Total duration**: Cumulative time spent transcribing (in minutes). +- **Average latency**: Mean processing time per request (in seconds). +- **Most active day**: Date with the highest number of recordings. +- **Top spoken words**: The 5 most frequent words (>3 characters). -**Metrics:** -- Total recording count -- Total processing duration (sum of `DurationMs`) -- Average processing duration -- Most active day -- Most frequently used skill (if any) - -## Architecture Changes -1. **`HistoryManager.cs`**: Handles thread-safe appending `HistoryEntry` to the `.jsonl` file, reading, and clearing. -2. **`DaemonService.cs`**: Calls `HistoryManager.SaveEntry` during the `ProcessStopRecordingAsync` method after text is finalized. -3. **`HistoryCommand.cs` & `StatsCommand.cs`**: CLI command definitions. -4. **`AppJsonSerializerContext.cs`**: Needs `[JsonSerializable(typeof(HistoryEntry))]`. +## Architecture +1. **`HistoryManager.cs`**: Handles thread-safe appending and reading of the `.jsonl` file. +2. **`TranscriptionOrchestrator.cs`**: Calls `HistoryManager.SaveEntry` after text is finalized. +3. **`HistoryCommand.cs` & `StatsCommand.cs`**: CLI command implementations. +4. 
**`AppJsonSerializerContext.cs`**: Provides AOT-compatible serialization for `HistoryEntry`. diff --git a/docs/PROTOCOL.md b/docs/PROTOCOL.md index f7c08b2..2e1e44a 100644 --- a/docs/PROTOCOL.md +++ b/docs/PROTOCOL.md @@ -9,48 +9,48 @@ The UNIX domain socket is typically located at: ## Message Format -Clients send small byte arrays to issue commands to the server. Depending on the command, the structure ranges from a single byte to a 3-byte payload containing the command ID and configuration flags for standard output handling. +Clients send small byte arrays (either 1 or 3 bytes) to issue commands to the server. ### Command Bytes | Command | Byte | Description | |---|---|---| | **START** | `1` | Forces the daemon to start recording. Ignored if already recording. | -| **STOP** | `2` | Forces the daemon to stop recording and begin processing the audio. Takes flags for response handling. | +| **STOP** | `2` | Forces the daemon to stop recording and begin processing. Takes flags. | | **ABORT** | `3` | Stops audio recording and discards the buffer without making API calls. | -| **TOGGLE** | `4` | Stops recording if currently recording; starts recording if currently inactive. Takes flags for response handling. | +| **TOGGLE** | `4` | Stops recording if currently recording; starts recording if inactive. Takes flags. | ## Payload Formats ### 1-Byte Payloads (`START`, `ABORT`) -When the client only needs to trigger state changes without receiving processing results back, it sends a single byte. - +Used for state changes that don't return streaming text. ```text [ Command Byte ] ``` Example (`ABORT`): `[ 0x03 ]` ### 3-Byte Payloads (`STOP`, `TOGGLE`) -When asking the daemon to process audio, the client can specify how it wants to receive the finalized text: typed via hotkeys (default), piped to standard output (`--pipe`), or copied to the clipboard (`--copy`). +When asking the daemon to process audio, the client can specify how it wants to receive the result. 
The client sends exactly 3 bytes: -The client sends exactly 3 bytes: ```text [ Command Byte ] [ Pipe Flag ] [ Copy Flag ] ``` - **Byte 0:** The command (`0x02` or `0x04`) -- **Byte 1:** Pipe to Stdout: `0x01` if enabled, `0x00` if disabled. -- **Byte 2:** Copy to Clipboard: `0x01` if enabled, `0x00` if disabled. +- **Byte 1:** **Pipe to Stdout**: `0x01` if enabled (client waits for stream), `0x00` if disabled. +- **Byte 2:** **Copy to Clipboard**: `0x01` if enabled, `0x00` if disabled. Example (`TOGGLE` with stdout piping enabled): `[ 0x04, 0x01, 0x00 ]` ## Server Responses -Depending on the flags provided in a 3-Byte Payload: +Depending on the flags: 1. **Default (No flags set):** - The server will process the audio, handle LLM modifications, and inject the text into the user's active window using Wayland (`wtype`) or X11 (`xdotool`). The socket is closed by the server. + The server processes the audio, handles LLM refinement, and injects the text into the user's active window using the configured backend (`wtype`, `xdotool`, or `ydotool`). The socket is closed by the server. -2. **Pipe or Copy Flag Set:** - The client will keep the connection open and wait to read the incoming text from the server. - The server will stream UTF-8 encoded text chunks back to the client as the LLM generates them. The client reads these chunks and pushes them to `stdout`. Once sending is complete, the server closes the socket. +2. **Pipe Flag Set:** + The client stays connected. The server streams UTF-8 encoded text chunks (tokens) back to the client as they are generated by the LLM. The client writes these to `stdout`. The server closes the socket when finished. + +3. **Copy Flag Set:** + The server handles copying to the system clipboard internally via its `ClipboardManager`. If the Pipe flag is also set, it will stream to stdout simultaneously. 
diff --git a/docs/STRUCTURE.md b/docs/STRUCTURE.md index 3edef7c..7fcfb5c 100644 --- a/docs/STRUCTURE.md +++ b/docs/STRUCTURE.md @@ -12,43 +12,45 @@ Toak is designed as a fast, Linux-native dictation application utilizing C# AOT Toak/ ├── Api/ │ ├── GroqApiClient.cs # Client for external transcription and LLM API calls (Groq/Whisper) +│ ├── OpenAiCompatibleClient.cs # Generic OpenAI-compatible client for Groq and Together AI │ └── Models/ # API payload representations ├── Assets/ # Sound files or other static resources ├── Audio/ -│ └── AudioRecorder.cs # Handles audio capture via system utilities (e.g., pw-record from PipeWire) +│ ├── AudioRecorder.cs # Handles audio capture via PipeWire (pw-record) +│ └── FfmpegAudioRecorder.cs # Universal audio capture via ffmpeg ├── Commands/ -│ ├── ToggleCommand.cs # Start/stop recording and pass pipe/copy flags -│ ├── DiscardCommand.cs # Abort the current recording -│ ├── OnboardCommand.cs # Initial interactive configuration setup +│ ├── ToggleCommand.cs # Client command to start/stop recording via socket +│ ├── DiscardCommand.cs # Client command to abort current recording +│ ├── OnboardCommand.cs # Interactive configuration setup wizard │ ├── ConfigUpdaterCommand.cs # Direct configuration modifications │ ├── ShowCommand.cs # Display current configuration -│ ├── SkillCommand.cs # CLI controller for discovering and adding Dynamic JSON Skills -│ ├── LatencyTestCommand.cs # Benchmark tool for API calls -│ ├── HistoryCommand.cs # CLI interface to query, export, or shred past transcripts -│ └── StatsCommand.cs # CLI interface to calculate analytics from history +│ ├── SkillCommand.cs # CLI controller for managing JSON Skills +│ ├── LatencyTestCommand.cs # Pipeline benchmark tool +│ ├── HistoryCommand.cs # Interface to query past transcriptions +│ └── StatsCommand.cs # Aggregated usage analytics ├── Configuration/ -│ ├── ConfigManager.cs # Loads and saves JSON configuration from the user's home folder +│ ├── ConfigManager.cs # 
Loads/saves JSON configuration │ └── ToakConfig.cs # Data model for user preferences ├── Core/ -│ ├── DaemonService.cs # The background daemon maintaining the socket server and handling states -│ ├── Logger.cs # Logging utility (verbose logging) -│ ├── HistoryManager.cs # Manages appending and reading the local history.jsonl -│ ├── HistoryEntry.cs # The data model for transcription history -│ ├── PromptBuilder.cs # Constructs the system prompts for the LLM based on user settings -│ ├── StateTracker.cs # Tracks the current application state (e.g. is recording active?) +│ ├── DaemonService.cs # Background daemon maintaining the socket server +│ ├── TranscriptionOrchestrator.cs # Coordinates audio recording, STT, LLM, and output +│ ├── Logger.cs # Logging utility +│ ├── HistoryManager.cs # Thread-safe history management (.jsonl) +│ ├── HistoryEntry.cs # Data model for transcription history +│ ├── PromptBuilder.cs # Constructs LLM system prompts +│ ├── StateTracker.cs # Tracks application state and recording PIDs +│ ├── Interfaces/ # Core abstractions (ILlmClient, IAudioRecorder, etc.) 
│ └── Skills/ # Data-driven JSON skill integrations -│ ├── SkillDefinition.cs # JSON Model -│ ├── DynamicSkill.cs # Runtime implementation mapping LLM context to actions -│ └── SkillRegistry.cs # Loads and detects skills from ~/.config/toak/skills/ ├── IO/ -│ ├── ClipboardManager.cs # Cross-session (Wayland/X11) clipboard manipulation (`wl-copy`, `xclip`) -│ ├── TextInjector.cs # Native keyboard injection handling (`wtype`, `xdotool`) -│ └── Notifications.cs # System notifications (`notify-send`) and sound playback (`paplay`) +│ ├── ClipboardManager.cs # Cross-session clipboard manipulation (wl-copy, xclip) +│ ├── TextInjector.cs # Native keyboard injection (wtype, xdotool, ydotool) +│ └── Notifications.cs # System notifications and sound playback ├── Serialization/ -│ └── AppJsonSerializerContext.cs # System.Text.Json source generation context for AOT support +│ └── AppJsonSerializerContext.cs # System.Text.Json source generation for AOT +├── bin/ # Compiler output ├── docs/ # Documentation -├── toak.service # systemd user service file to run the daemon automatically -├── uninstall.sh # Script to completely remove daemon, service, and binaries +├── install.sh # Native AOT build and installation script +├── toak.service # systemd user service definition └── Program.cs # Application entry point using System.CommandLine ``` diff --git a/install.sh b/install.sh index fc94393..81f4092 100755 --- a/install.sh +++ b/install.sh @@ -17,8 +17,8 @@ if [ -d "/usr/share/zsh/site-functions" ]; then fi echo "Installing and starting systemd user service..." 
-sudo mkdir -p ~/.config/systemd/user -sudo cp toak.service ~/.config/systemd/user/ +mkdir -p ~/.config/systemd/user +cp toak.service ~/.config/systemd/user/ systemctl --user daemon-reload systemctl --user enable --now toak.service diff --git a/uninstall.sh b/uninstall.sh index 2cbe7b6..1b600f1 100755 --- a/uninstall.sh +++ b/uninstall.sh @@ -8,7 +8,7 @@ systemctl --user stop toak.service || true systemctl --user disable toak.service || true echo "Removing systemd service file..." -sudo rm -f ~/.config/systemd/user/toak.service +rm -f ~/.config/systemd/user/toak.service systemctl --user daemon-reload echo "Removing Toak executable..."