diff --git a/Commands/StartCommand.cs b/Commands/StartCommand.cs
new file mode 100644
index 0000000..4dcbfc2
--- /dev/null
+++ b/Commands/StartCommand.cs
@@ -0,0 +1,41 @@
+using System;
+using System.Net.Sockets;
+using System.Threading.Tasks;
+using Spectre.Console;
+using Toak.Core;
+
+namespace Toak.Commands;
+
+/// <summary>
+/// CLI handler for <c>toak start</c>: sends the single-byte START command (1) to the daemon socket.
+/// </summary>
+public static class StartCommand
+{
+    /// <summary>
+    /// Connects to the daemon's Unix domain socket and sends the START command byte.
+    /// Failures are reported on the console; nothing is rethrown.
+    /// </summary>
+    /// <param name="verbose">Assigned to <c>Logger.Verbose</c>; also prints a confirmation after sending.</param>
+    public static async Task ExecuteAsync(bool verbose)
+    {
+        Logger.Verbose = verbose;
+        var socketPath = DaemonService.GetSocketPath();
+        try
+        {
+            using var socket = new Socket(AddressFamily.Unix, SocketType.Stream, ProtocolType.Unspecified);
+            var endPoint = new UnixDomainSocketEndPoint(socketPath);
+            await socket.ConnectAsync(endPoint);
+            await socket.SendAsync(new byte[] { 1 }, SocketFlags.None);
+            if (verbose) Console.WriteLine("Sent START command to daemon.");
+        }
+        catch (SocketException)
+        {
+            AnsiConsole.MarkupLine("[red]Failed to connect to Toak daemon.[/]");
+        }
+        catch (Exception ex)
+        {
+            // Exception text is arbitrary; escape it so '[' / ']' are not parsed as Spectre markup.
+            AnsiConsole.MarkupLine($"[red]Error:[/] {Markup.Escape(ex.Message)}");
+        }
+    }
+}
diff --git a/Commands/StatusCommand.cs b/Commands/StatusCommand.cs
new file mode 100644
index 0000000..3e91b0e
--- /dev/null
+++ b/Commands/StatusCommand.cs
@@ -0,0 +1,46 @@
+using System;
+using System.Net.Sockets;
+using System.Threading.Tasks;
+using Spectre.Console;
+using Toak.Core;
+
+namespace Toak.Commands;
+
+public static class StatusCommand
+{
+    public static async Task ExecuteAsync(bool json, bool verbose)
+    {
+        Logger.Verbose = verbose;
+
+        var socketPath = DaemonService.GetSocketPath();
+
+        try
+        {
+            using var socket = new Socket(AddressFamily.Unix, SocketType.Stream, ProtocolType.Unspecified);
+            var endPoint = new UnixDomainSocketEndPoint(socketPath);
+            await socket.ConnectAsync(endPoint);
+
+            var msg = new byte[] { 5, (byte)(json ?
1 : 0) };
+            await socket.SendAsync(msg, SocketFlags.None);
+
+            var responseBuffer = new byte[4096];
+            int received = await socket.ReceiveAsync(responseBuffer, SocketFlags.None);
+            if (received > 0)
+            {
+                var text = System.Text.Encoding.UTF8.GetString(responseBuffer, 0, received);
+                Console.WriteLine(text);
+            }
+        }
+        catch (SocketException) // any socket failure in the try block is reported as "Offline"
+        {
+            if (json)
+                Console.WriteLine("{\"state\": \"Offline\"}");
+            else
+                Console.WriteLine("Offline");
+        }
+        catch (Exception ex) // message is escaped so '[' / ']' are not parsed as Spectre markup
+        {
+            if (!json) AnsiConsole.MarkupLine($"[red]Error:[/] {Markup.Escape(ex.Message)}");
+        }
+    }
+}
diff --git a/Commands/StopCommand.cs b/Commands/StopCommand.cs
new file mode 100644
index 0000000..24b31cf
--- /dev/null
+++ b/Commands/StopCommand.cs
@@ -0,0 +1,61 @@
+using System;
+using System.Net.Sockets;
+using System.Threading.Tasks;
+using Spectre.Console;
+using Toak.Core;
+
+namespace Toak.Commands;
+
+/// <summary>
+/// CLI handler for <c>toak stop</c>: sends the 3-byte STOP command to the daemon and relays its reply.
+/// </summary>
+public static class StopCommand
+{
+    /// <summary>
+    /// Sends <c>[2, pipe flag, copy flag]</c> to the daemon socket, then reads until a receive returns
+    /// 0 bytes. Received text is echoed to stdout only when <paramref name="pipeToStdout"/> is set.
+    /// </summary>
+    /// <param name="pipeToStdout">Sent as the second byte; also controls whether the reply is printed.</param>
+    /// <param name="copyToClipboard">Sent as the third byte.</param>
+    /// <param name="verbose">Assigned to <c>Logger.Verbose</c>; also prints a confirmation after sending.</param>
+    public static async Task ExecuteAsync(bool pipeToStdout, bool copyToClipboard, bool verbose)
+    {
+        Logger.Verbose = verbose;
+        var socketPath = DaemonService.GetSocketPath();
+        try
+        {
+            using var socket = new Socket(AddressFamily.Unix, SocketType.Stream, ProtocolType.Unspecified);
+            var endPoint = new UnixDomainSocketEndPoint(socketPath);
+            await socket.ConnectAsync(endPoint);
+
+            var msg = new byte[] { 2, (byte)(pipeToStdout ? 1 : 0), (byte)(copyToClipboard ? 1 : 0) };
+            await socket.SendAsync(msg, SocketFlags.None);
+            if (verbose) Console.WriteLine("Sent STOP command to daemon.");
+
+            var responseBuffer = new byte[4096];
+            // A stateful decoder keeps the trailing bytes of a multi-byte UTF-8 sequence that is split
+            // across two reads; decoding each chunk independently would emit U+FFFD for both halves.
+            var decoder = System.Text.Encoding.UTF8.GetDecoder();
+            var charBuffer = new char[System.Text.Encoding.UTF8.GetMaxCharCount(responseBuffer.Length)];
+            while (true)
+            {
+                int received = await socket.ReceiveAsync(responseBuffer, SocketFlags.None);
+                if (received == 0) break; // 0 bytes: the peer closed the connection
+                if (pipeToStdout)
+                {
+                    int charCount = decoder.GetChars(responseBuffer, 0, received, charBuffer, 0);
+                    Console.Write(charBuffer, 0, charCount);
+                }
+            }
+        }
+        catch (SocketException)
+        {
+            AnsiConsole.MarkupLine("[red]Failed to connect to Toak daemon.[/]");
+        }
+        catch (Exception ex)
+        {
+            // Exception text is arbitrary; escape it so '[' / ']' are not parsed as Spectre markup.
+            AnsiConsole.MarkupLine($"[red]Error:[/] {Markup.Escape(ex.Message)}");
+        }
+    }
+}
diff --git a/Configuration/ToakConfig.cs b/Configuration/ToakConfig.cs index 653f3bd..b17fb8b 100644 --- a/Configuration/ToakConfig.cs +++ b/Configuration/ToakConfig.cs @@ -9,6 +9,7 @@ public class ToakConfig public string AudioBackend { get; set; } = "pw-record"; // pw-record or ffmpeg public bool ModulePunctuation { get; set; } = true; public bool ModuleTechnicalSanitization { get; set; } = true; + public int MinRecordingDuration { get; set; } = 500; public string WhisperLanguage { get; set; } = string.Empty; public string LlmModel { get; set; } = Toak.Core.Constants.Defaults.LlmModel;
diff --git a/Core/DaemonService.cs b/Core/DaemonService.cs
index 835cedf..bbb9166 100644
--- a/Core/DaemonService.cs
+++ b/Core/DaemonService.cs
@@ -140,6 +140,32 @@ public static class DaemonService
                         else
                             await orchestrator.ProcessStartRecordingAsync();
                     }
+                    else if (cmd == 5) // STATUS
+                    {
+                        bool json = pipeToStdout; // buffer[1] == 1 is json
+                        bool isRecording = stateTracker.IsRecording();
+                        string stateStr = isRecording ? "Recording" : "Idle";
+
+                        if (json)
+                        {
+                            var start = stateTracker.GetRecordingStartTime();
+                            double durationMs = 0;
+                            if (isRecording && start.HasValue)
+                            {
+                                // Clamp at zero so a backwards clock adjustment cannot report a negative duration.
+                                durationMs = Math.Max(0, (DateTime.UtcNow - start.Value).TotalMilliseconds);
+                            }
+                            // Format the number with the invariant culture: string interpolation would use the
+                            // current culture, and the payload must stay valid JSON under any locale.
+                            var durationText = ((long)Math.Round(durationMs)).ToString(System.Globalization.CultureInfo.InvariantCulture);
+                            var jsonStr = $"{{\"state\": \"{stateStr}\", \"duration\": {durationText}}}";
+                            await client.SendAsync(System.Text.Encoding.UTF8.GetBytes(jsonStr), SocketFlags.None);
+                        }
+                        else
+                        {
+                            await client.SendAsync(System.Text.Encoding.UTF8.GetBytes(stateStr), SocketFlags.None);
+                        }
+                    }
                 }
             }
             catch (Exception ex)
diff --git a/Core/Interfaces/Interfaces.cs b/Core/Interfaces/Interfaces.cs index 97fc2f3..528af5a 100644 --- a/Core/Interfaces/Interfaces.cs +++ b/Core/Interfaces/Interfaces.cs @@ -59,4 +59,5 @@ public interface IRecordingStateTracker void SetRecording(int pid); void ClearRecording(); bool IsRecording(); + DateTime? GetRecordingStartTime(); } diff --git a/Core/StateTracker.cs b/Core/StateTracker.cs index 1f503b4..2332b08 100644 --- a/Core/StateTracker.cs +++ b/Core/StateTracker.cs @@ -14,15 +14,15 @@ public class StateTracker : IRecordingStateTracker public void SetRecording(int ffmpegPid) { Logger.LogDebug($"Setting recording state with PID {ffmpegPid}"); - File.WriteAllText(StateFilePath, ffmpegPid.ToString()); + File.WriteAllText(StateFilePath, $"{ffmpegPid}\n{DateTime.UtcNow.Ticks}"); } public int? GetRecordingPid() { if (File.Exists(StateFilePath)) { - var content = File.ReadAllText(StateFilePath).Trim(); - if (int.TryParse(content, out var pid)) + var lines = File.ReadAllLines(StateFilePath); + if (lines.Length > 0 && int.TryParse(lines[0], out var pid)) { Logger.LogDebug($"Read recording PID {pid} from state file"); return pid; @@ -31,6 +31,19 @@ public class StateTracker : IRecordingStateTracker return null; } + public DateTime?
GetRecordingStartTime()
+    {
+        if (File.Exists(StateFilePath))
+        {
+            var lines = File.ReadAllLines(StateFilePath); // line 0: PID, line 1: UTC start time in ticks
+            if (lines.Length > 1 && long.TryParse(lines[1], out var ticks) && ticks >= DateTime.MinValue.Ticks && ticks <= DateTime.MaxValue.Ticks) // range check: the DateTime constructor throws on out-of-range ticks
+            {
+                return new DateTime(ticks, DateTimeKind.Utc);
+            }
+        }
+        return null;
+    }
+
     public void ClearRecording()
     {
         if (File.Exists(StateFilePath))
diff --git a/Core/TranscriptionOrchestrator.cs b/Core/TranscriptionOrchestrator.cs
index 04f952f..0dc1d50 100644
--- a/Core/TranscriptionOrchestrator.cs
+++ b/Core/TranscriptionOrchestrator.cs
@@ -58,6 +58,21 @@ public class TranscriptionOrchestrator : ITranscriptionOrchestrator
         Logger.LogDebug("Received STOP command");
 
         var config = _configProvider.LoadConfig();
+
+        // Recordings shorter than MinRecordingDuration (ms) are discarded instead of transcribed.
+        var startTime = _stateTracker.GetRecordingStartTime();
+        if (startTime.HasValue)
+        {
+            var duration = (DateTime.UtcNow - startTime.Value).TotalMilliseconds;
+            if (duration < config.MinRecordingDuration)
+            {
+                Logger.LogDebug($"Recording duration {duration}ms is less than min {config.MinRecordingDuration}ms. Discarding.");
+                // Await the abort so it completes (and any exception surfaces) before this handler returns.
+                await ProcessAbortAsync();
+                return;
+            }
+        }
+
         _notifications.PlaySound(config.StopSoundPath);
         _notifications.Notify("Toak", "Transcribing...");
 
diff --git a/IDEAS.md b/IDEAS.md index 98d04f9..c90b968 100644 --- a/IDEAS.md +++ b/IDEAS.md @@ -4,282 +4,12 @@ A curated list of CLI-native features to enhance the dictation workflow. --- -## Core Workflow Additions - -### `toak history [-n N]` -Display recent transcriptions with timestamps. Use `-n 1` to replay the last result. - -**Use case:** -- `toak history` - Show last 10 transcriptions -- `toak history -n 5` - Show last 5 -- `toak history -n 1` - Show most recent (equivalent to a "last" command) - -**Storage:** Append to `~/.local/share/toak/history.jsonl` on each successful transcription: -```json -{"timestamp":"2025-01-15T09:23:00Z","raw":"hello world","refined":"Hello world."} -``` - ---- - - - -## Configuration Profiles - -### `toak profile ` / `toak profile` -Switch between prompt presets instantly.
- -**Built-in profiles:** -- `default` - Current behavior -- `code` - Technical mode: preserves indentation, brackets, camelCase -- `email` - Professional mode with formal tone -- `notes` - Concise mode, bullet points enabled -- `social` - Casual mode, emoji allowed - -**Usage:** -```bash -toak profile code # Switch to code preset -toak profile # Show current profile -toak profiles # List available profiles -``` - -**Storage:** `~/.config/toak/profiles/.json` - Each file is a complete `ToakConfig` override. - ---- - - - -## History Management - -### `toak stats` -Display usage statistics and analytics. - -```bash -$ toak stats -Total recordings: 342 -Total duration: 4h 23m -Average length: 45s -Most active day: 2025-01-10 (23 recordings) -Top words: "implementation", "refactor", "meeting" -``` - -**Metrics tracked:** -- Total recordings count -- Total/average/min/max duration -- Daily/weekly activity -- Most common words (from refined text) -- API usage estimates - ---- - -### `toak history --export ` -Export transcription history to various formats. - -```bash -toak history --export notes.md # Markdown format -toak history --export log.txt # Plain text -toak history --export data.json # Full JSON dump -``` - -**Markdown format example:** -```markdown -# Toak Transcriptions - 2025-01-15 - -## 09:23:00 -We need to fix the API endpoint. - -## 09:45:12 -- Review the pull request -- Update documentation -``` - ---- - -### `toak history --grep ` -Search through transcription history. - -```bash -toak history --grep "API" # Find all mentions of API -toak history --grep "TODO" -n 5 # Last 5 occurrences of "TODO" -toak history --grep "refactor" --raw # Search raw transcripts instead -``` - -**Output format:** -``` -2025-01-15 09:23:00 We need to fix the API endpoint. -2025-01-15 14:12:33 The API response time is too slow. -``` - ---- - -### `toak history --shred` -Securely delete transcription history. 
- -```bash -toak history --shred # Delete entire history file -toak history --shred -n 5 # Delete last 5 entries only -toak history --shred --raw # Also delete archived raw audio files -``` - -**Security:** Overwrites data before deletion (optional), removes from disk. - ---- - -## Advanced Architecture - -### `toak daemon` / `toak stop-daemon` -Background service mode for reduced latency. The CLI interface stays identical, but work is offloaded to a persistent process. - -**Architecture:** -``` -┌─────────────┐ Unix Socket ┌─────────────────────────────┐ -│ toak CLI │ ───────────────────► │ toakd │ -│ (client) │ │ (background daemon) │ -│ Exits │ ◄──── Ack + Exit ──── │ - Long-running process │ -│ Instantly │ │ - Hot HttpClient pool │ -└─────────────┘ │ - Config cached in memory │ - │ - Manages ffmpeg lifecycle │ - └─────────────────────────────┘ -``` - -**CLI stays the same:** -```bash -toak toggle # Client sends "start" to daemon, exits (~10ms) -# ... recording happens ... -toak toggle # Client sends "stop" to daemon, exits (~10ms) - # Daemon continues: upload → transcribe → refine → type -``` - -**Why it's faster (without AOT):** - -| Operation | Current | Daemon | Savings | -|-----------|---------|--------|---------| -| JIT compilation | 150ms | 0ms | 150ms | -| Assembly loading | 50ms | 0ms | 50ms | -| DNS lookup | 40ms | 0ms | 40ms | -| TLS handshake | 80ms | 0ms | 80ms | -| Config read | 10ms | 0ms | 10ms | -| **Total** | **~330ms** | **~10ms** | **~320ms** | - -**Why it's still faster (with AOT):** - -AOT eliminates JIT/assembly overhead, but not everything: - -| Operation | AOT Binary | AOT Daemon | Savings | -|-----------|------------|------------|---------| -| Process startup | 20ms | 0ms | 20ms | -| DNS lookup | 40ms | 0ms | 40ms | -| TLS handshake | 80ms | 0ms | 80ms | -| Config read | 5ms | 0ms | 5ms | -| **Total** | **~145ms** | **~10ms** | **~135ms** | - -**Verdict with AOT:** -- Without daemon: Each toggle takes ~145ms before network call starts 
-- With daemon: Each toggle takes ~10ms (just socket IPC) -- The daemon still saves ~135ms, but it's less critical than without AOT - -**Trade-offs:** -- **Pro:** Faster hotkey response, persistent connections, shared state -- **Con:** Added complexity (process management, crash recovery, socket IPC) -- **Con:** Debugging harder when logic lives in daemon - -**Usage:** -```bash -toak daemon # Start background service -toak stop-daemon # Shutdown background service -toak status # Check if daemon is running -``` - -**Implementation notes:** -- Socket path: `/tmp/toakd.sock` or `$XDG_RUNTIME_DIR/toakd.sock` -- Protocol: Simple line-based or JSON messages -- Daemon writes PID to `/tmp/toakd.pid` for status checks -- Client binary checks for daemon on startup; can auto-start or error - ---- - -## Future Innovations - -### Hotword Commands (LLM Routing) -Instruct the LLM in `PromptBuilder` to output a specific JSON structure if given a command phrase. If a specific hotword like "System command" or "Computer dictate" is detected at the start of the audio, Toak parses the JSON, skips typng out via `xdotool`/`wtype`, and instead executes a pre-defined background action. - -If it doesn't hear a command phrase, it simply returns the text normally and types it. - -**How it works (Under the Hood):** -The LLM is prompted to always return JSON in the background when a command is directed at the assistant. -```json -{ - "is_command": true, - "action": "append_to_notes", - "content": "Buy milk and eggs", - "meta": {} -} -``` - -**Alternative Hotword Ideas:** -Since "Toak" is not a real English word, Whisper might transcribe it as "talk", "toke", or "oak." It is highly recommended to use distinct, phonetically clear hotwords such as: -- **"System..."** (e.g. "System note:") -- **"Computer..."** (e.g. "Computer search:") -- **"Action..."** (e.g. "Action commit:") -- **"Dictate..."** (e.g. "Dictate terminal:") -- **"Listen up..."** (e.g. 
"Listen up translate...") - -**Prompt Ideas & Use Cases:** - -1. **Quick Notes / Brainstorming:** - - *Hotword:* `"System note:"` or `"Drop this in my notes:"` - - *Action:* Appends the spoken text to a configured `~/notes.md` file in the background without interrupting your current window. - - *Example:* "System note: I need to remember to check the database migrations later today." - -2. **Terminal / CLI Execution:** - - *Hotword:* `"Computer terminal:"` or `"Command:"` - - *Action:* Takes the natural language command, asks the LLM to translate it into a bash command, and types it into a new tmux window or background process. - - *Example:* "Computer terminal: find all python files modified in the last 2 days." - -3. **Git Commit Messages:** - - *Hotword:* `"Action commit:"` - - *Action:* Automatically formats the dictated text into a standard conventional commit message, stages all files, and commits them. - - *Example:* "Action commit: I refactored the audio recorder to use native processes instead of the old library." -> LLM outputs `refactor(audio): migrate to native processes` and runs `git commit -am "..."`. - -4. **Web Search / Lookup:** - - *Hotword:* `"System search:"` or `"Look up:"` - - *Action:* Opens your default browser and performs a search for the spoken phrase. - - *Example:* "System search: MDN documentation for grid layout." - -5. **Translating on the fly:** - - *Hotword:* `"Translate to Spanish:"` - - *Action:* Instead of typing English, it types the translated version of the rest of the sentence. - - *Example:* "Translate to Spanish: Hello, how are you today?" -> Types out `Hola, ¿cómo estás hoy?`. - ---- - -## Implementation Priority - -### Tier 1: High Impact, Low Effort -*(All Tier 1 items have been implemented!)* - -### Tier 2: Medium Effort (Requires History Storage) -4. `toak history` with `--export`, `--grep`, `--shred` flags -5. `toak stats` - Analytics aggregation -6. 
`toak copy` - Clipboard integration - -### Tier 3: Higher Complexity -7. `toak profile` - Config presets -8. `toak daemon` - Background service architecture - ---- - -## Technical Notes - -**History Storage:** -- Use JSON Lines format (`.jsonl`) for append-only log -- Rotate at 5000 entries or 30 days -- Store both raw and refined text for debugging - - -**Pipe Detection in C#:** -```csharp -if (Console.IsOutputRedirected || args.Contains("--pipe")) -{ - Console.WriteLine(refinedText); -} -``` +## 2. Bring Your Own Script (BYOS) Window Context +Allow the LLM's system prompt to dynamically adapt based on the currently focused application (e.g., formatting as code for `alacritty`, or conversational for `discord`). +* **Implementation:** Add a `ContextCommand` string to `ToakConfig.cs` (e.g., `~/.config/toak/get-window.sh`). Toak executes this script via `Process.Start` and appends the `stdout` result to the LLM prompt. +* **Rationale:** Avoids bloating the tool with integrations for dozens of display managers and window managers. Users write their own one-liners (`hyprctl activewindow`, `xprop`, etc.), keeping Toak lean. + +## 3. In-Place Editing via Primary Selection +Transform Toak from a pure dictation tool into an inline text-editing copilot by reading the Linux "Primary" clipboard (currently highlighted text). +* **Implementation:** Before sending the prompt to the LLM, attempt to read the primary selection via `wl-paste -p` (Wayland) or `xclip -o -selection primary` (X11). If text exists, pass it to the LLM as context alongside the dictated audio. +* **Rationale:** Allows users to highlight a paragraph, hit the dictation key, and say "Translate this to Spanish" or "Make this sound more professional," replacing the highlighted text via the typing backend. 
diff --git a/Program.cs b/Program.cs
index 85a185e..417de87 100644
--- a/Program.cs
+++ b/Program.cs
@@ -23,6 +23,26 @@ public class Program
         toggleCmd.SetHandler(ToggleCommand.ExecuteAsync, pipeOption, copyOption, verboseOption);
         rootCommand.AddCommand(toggleCmd);
 
+        // Start Command
+        var startCmd = new Command("start", "Explicitly starts the recording");
+        startCmd.SetHandler(StartCommand.ExecuteAsync, verboseOption);
+        rootCommand.AddCommand(startCmd);
+
+        // Stop Command
+        var stopCmd = new Command("stop", "Explicitly stops the recording");
+        stopCmd.AddOption(pipeOption);
+        stopCmd.AddOption(copyOption);
+        stopCmd.SetHandler(StopCommand.ExecuteAsync, pipeOption, copyOption, verboseOption);
+        rootCommand.AddCommand(stopCmd);
+
+        // Status Command
+        var statusCmd = new Command("status", "Outputs the current daemon status");
+        // Typed as bool so SetHandler can bind it to StatusCommand.ExecuteAsync(bool json, bool verbose).
+        var jsonOption = new Option<bool>("--json", "Output status as JSON");
+        statusCmd.AddOption(jsonOption);
+        statusCmd.SetHandler(StatusCommand.ExecuteAsync, jsonOption, verboseOption);
+        rootCommand.AddCommand(statusCmd);
+
         // Daemon Command
         var daemonCmd = new Command("daemon", "Starts the background service");
         daemonCmd.SetHandler(Toak.Core.DaemonService.StartAsync, verboseOption);
diff --git a/README.md b/README.md index 0612f36..8d59669 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,9 @@ Toak operates as a persistent **daemon** that manages state and API calls, while ### Core Commands - **`toak toggle`**: The primary command. Start recording; run again to stop, transcribe, and type/copy/stdout the result. +- **`toak start`**: Explicitly start recording. +- **`toak stop`**: Explicitly stop recording. +- **`toak status`**: Output the daemon's current state (`Recording`, `Idle`, or JSON). - **`toak daemon`**: Runs the background service manually (usually managed by systemd). - **`toak onboard`**: Launches the interactive configuration wizard for providers, models, and backends.
- **`toak discard`**: Instantly aborts the current recording without performing any transcription. @@ -106,5 +109,6 @@ Key settings in `ToakConfig.cs` (managed via `toak onboard` or `toak config`): - `WhisperLanguage`: Set spoken language (e.g., `en`, `es`, `fr`). - `TypingBackend`: Choose between `wtype`, `xdotool`, or `ydotool`. - `AudioBackend`: Choose between `pw-record` (PipeWire) or `ffmpeg`. +- `MinRecordingDuration`: Set the minimum recording duration in ms (default: `500`). - `ModulePunctuation`: Toggle automatic grammar and punctuation fixing. - `ModuleTechnicalSanitization`: Ensures technical terms are formatted correctly. diff --git a/_toak b/_toak index 5ce8a10..56234b8 100644 --- a/_toak +++ b/_toak @@ -13,6 +13,9 @@ _toak() { commands=( 'toggle:Starts or stops the recording' + 'start:Explicitly starts the recording' + 'stop:Explicitly stops the recording' + 'status:Outputs the current daemon status' 'daemon:Starts the background background service' 'discard:Abort current recording without transcribing' 'onboard:Configure the application' @@ -40,6 +43,19 @@ _toak() { '(-p --pipe)'{-p,--pipe}'[Output transcription to stdout instead of typing]' \ '--copy[Copy to clipboard instead of typing]' ;; + start) + _arguments \ + '(-v --verbose)'{-v,--verbose}'[Enable detailed debug logging]' + ;; + stop) + _arguments \ + '(-p --pipe)'{-p,--pipe}'[Output transcription to stdout instead of typing]' \ + '--copy[Copy to clipboard instead of typing]' + ;; + status) + _arguments \ + '--json[Output status as JSON]' + ;; discard) _arguments \ '(-p --pipe)'{-p,--pipe}'[Output transcription to stdout instead of typing]' diff --git a/docs/PROTOCOL.md b/docs/PROTOCOL.md index 2e1e44a..d0f92cf 100644 --- a/docs/PROTOCOL.md +++ b/docs/PROTOCOL.md @@ -19,6 +19,7 @@ Clients send small byte arrays (1 to 3 bytes) to issue commands to the server. | **STOP** | `2` | Forces the daemon to stop recording and begin processing. Takes flags. 
| | **ABORT** | `3` | Stops audio recording and discards the buffer without making API calls. | | **TOGGLE** | `4` | Stops recording if currently recording; starts recording if inactive. Takes flags. | +| **STATUS** | `5` | Queries the daemon for its current recording status (idle or recording). Takes a JSON flag. | ## Payload Formats @@ -27,7 +28,14 @@ Used for state changes that don't return streaming text. ```text [ Command Byte ] ``` -Example (`ABORT`): `[ 0x03 ]` +Example (`ABORT`): `[ 0x03 ]` + +### 2-Byte Payloads (`STATUS`) +When asking the daemon for its status, the client specifies whether it wants a JSON response. +```text +[ Command Byte ] [ JSON Flag ] +``` +Example (`STATUS` with JSON): `[ 0x05, 0x01 ]` ### 3-Byte Payloads (`STOP`, `TOGGLE`) When asking the daemon to process audio, the client can specify how it wants to receive the result. The client sends exactly 3 bytes: