feat: Implement a modular skill system with hotword detection, streaming text output, and enhanced logging.
This commit is contained in:
@@ -4,6 +4,7 @@ using System.Text.Json.Serialization;
|
|||||||
|
|
||||||
using Toak.Api.Models;
|
using Toak.Api.Models;
|
||||||
using Toak.Serialization;
|
using Toak.Serialization;
|
||||||
|
using Toak.Core;
|
||||||
|
|
||||||
namespace Toak.Api;
|
namespace Toak.Api;
|
||||||
|
|
||||||
@@ -39,7 +40,9 @@ public class GroqApiClient
|
|||||||
content.Add(new StringContent(firstLang), "language");
|
content.Add(new StringContent(firstLang), "language");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Logger.LogDebug($"Sending Whisper API request ({modelToUse})...");
|
||||||
var response = await _httpClient.PostAsync("audio/transcriptions", content);
|
var response = await _httpClient.PostAsync("audio/transcriptions", content);
|
||||||
|
Logger.LogDebug($"Whisper API response status: {response.StatusCode}");
|
||||||
|
|
||||||
if (!response.IsSuccessStatusCode)
|
if (!response.IsSuccessStatusCode)
|
||||||
{
|
{
|
||||||
@@ -67,7 +70,9 @@ public class GroqApiClient
|
|||||||
|
|
||||||
var jsonContent = new StringContent(JsonSerializer.Serialize(requestBody, AppJsonSerializerContext.Default.LlamaRequest), System.Text.Encoding.UTF8, "application/json");
|
var jsonContent = new StringContent(JsonSerializer.Serialize(requestBody, AppJsonSerializerContext.Default.LlamaRequest), System.Text.Encoding.UTF8, "application/json");
|
||||||
|
|
||||||
|
Logger.LogDebug($"Sending Llama API request (model: {requestBody.Model})...");
|
||||||
var response = await _httpClient.PostAsync("chat/completions", jsonContent);
|
var response = await _httpClient.PostAsync("chat/completions", jsonContent);
|
||||||
|
Logger.LogDebug($"Llama API response status: {response.StatusCode}");
|
||||||
|
|
||||||
if (!response.IsSuccessStatusCode)
|
if (!response.IsSuccessStatusCode)
|
||||||
{
|
{
|
||||||
@@ -80,4 +85,55 @@ public class GroqApiClient
|
|||||||
|
|
||||||
return result?.Choices?.FirstOrDefault()?.Message?.Content ?? string.Empty;
|
return result?.Choices?.FirstOrDefault()?.Message?.Content ?? string.Empty;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Sends a streaming chat-completion request and yields each piece of generated text as it arrives.
/// </summary>
/// <param name="rawTranscript">Transcript text; sent as the user message wrapped in <c>&lt;transcript&gt;</c> tags.</param>
/// <param name="systemPrompt">Text sent as the system message.</param>
/// <param name="model">Model identifier; a null or blank value falls back to <c>openai/gpt-oss-20b</c>.</param>
/// <returns>The non-empty <c>delta.content</c> fragments of the first choice of every chunk, in arrival order.</returns>
/// <exception cref="Exception">Thrown when the API answers with a non-success status code; the message carries the status and response body.</exception>
public async IAsyncEnumerable<string> RefineTextStreamAsync(string rawTranscript, string systemPrompt, string model = "openai/gpt-oss-20b")
{
    const string DataPrefix = "data: ";
    const string DoneMarker = "[DONE]";

    var requestBody = new LlamaRequest
    {
        Model = string.IsNullOrWhiteSpace(model) ? "openai/gpt-oss-20b" : model,
        Temperature = 0.0,
        Stream = true,
        Messages = new[]
        {
            new LlamaRequestMessage { Role = "system", Content = systemPrompt },
            new LlamaRequestMessage { Role = "user", Content = $"<transcript>{rawTranscript}</transcript>" }
        }
    };

    var jsonContent = new StringContent(JsonSerializer.Serialize(requestBody, AppJsonSerializerContext.Default.LlamaRequest), System.Text.Encoding.UTF8, "application/json");

    using var request = new HttpRequestMessage(HttpMethod.Post, "chat/completions") { Content = jsonContent };
    request.Headers.Accept.Add(new MediaTypeWithQualityHeaderValue("text/event-stream"));

    Logger.LogDebug($"Sending Llama Stream API request (model: {requestBody.Model})...");

    // ResponseHeadersRead returns as soon as the headers are in, so the body can be consumed incrementally below.
    using var response = await _httpClient.SendAsync(request, HttpCompletionOption.ResponseHeadersRead);
    Logger.LogDebug($"Llama Stream API response status: {response.StatusCode}");

    if (!response.IsSuccessStatusCode)
    {
        var error = await response.Content.ReadAsStringAsync();
        throw new Exception($"Llama API Error: {response.StatusCode} - {error}");
    }

    using var stream = await response.Content.ReadAsStreamAsync();
    using var reader = new StreamReader(stream);

    string? line;
    while ((line = await reader.ReadLineAsync()) != null)
    {
        // Only "data: " lines carry a payload; blank lines and any other line are skipped.
        // Ordinal comparison keeps the prefix test consistent with the fixed-length slice below.
        if (!line.StartsWith(DataPrefix, StringComparison.Ordinal))
        {
            continue;
        }

        var data = line.Substring(DataPrefix.Length).Trim();
        if (data == DoneMarker)
        {
            break;
        }

        var chunk = JsonSerializer.Deserialize(data, AppJsonSerializerContext.Default.LlamaStreamResponse);
        var content = chunk?.Choices?.FirstOrDefault()?.Delta?.Content;
        if (!string.IsNullOrEmpty(content))
        {
            yield return content;
        }
    }
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -18,6 +18,8 @@ public class LlamaRequest
|
|||||||
public LlamaRequestMessage[] Messages { get; set; } = Array.Empty<LlamaRequestMessage>();
|
public LlamaRequestMessage[] Messages { get; set; } = Array.Empty<LlamaRequestMessage>();
|
||||||
[JsonPropertyName("temperature")]
|
[JsonPropertyName("temperature")]
|
||||||
public double Temperature { get; set; } = 0.0;
|
public double Temperature { get; set; } = 0.0;
|
||||||
|
[JsonPropertyName("stream")]
|
||||||
|
public bool? Stream { get; set; }
|
||||||
}
|
}
|
||||||
|
|
||||||
public class LlamaResponse
|
public class LlamaResponse
|
||||||
@@ -31,3 +33,22 @@ public class LlamaChoice
|
|||||||
[JsonPropertyName("message")]
|
[JsonPropertyName("message")]
|
||||||
public LlamaRequestMessage Message { get; set; } = new();
|
public LlamaRequestMessage Message { get; set; } = new();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// One chunk of a streamed chat-completion response.
/// </summary>
public class LlamaStreamResponse
{
    /// <summary>
    /// Choices carried by this chunk (JSON property <c>choices</c>). Defaults to an empty array.
    /// </summary>
    [JsonPropertyName("choices")]
    public LlamaStreamChoice[] Choices { get; set; } = Array.Empty<LlamaStreamChoice>();
}
|
||||||
|
|
||||||
|
/// <summary>
/// A single choice inside a streamed response chunk.
/// </summary>
public class LlamaStreamChoice
{
    /// <summary>
    /// Incremental payload of this choice (JSON property <c>delta</c>). Defaults to an empty delta.
    /// </summary>
    [JsonPropertyName("delta")]
    public LlamaStreamDelta Delta { get; set; } = new LlamaStreamDelta();
}
|
||||||
|
|
||||||
|
/// <summary>
/// Incremental content of a streamed choice.
/// </summary>
public class LlamaStreamDelta
{
    /// <summary>
    /// Text fragment carried by this delta (JSON property <c>content</c>); <c>null</c> when absent.
    /// </summary>
    [JsonPropertyName("content")]
    public string? Content { get; set; }
}
|
||||||
|
|
||||||
|
|||||||
BIN
Assets/Audio/beep.wav
Normal file
BIN
Assets/Audio/beep.wav
Normal file
Binary file not shown.
@@ -15,9 +15,12 @@ public static class AudioRecorder
|
|||||||
{
|
{
|
||||||
if (File.Exists(WavPath))
|
if (File.Exists(WavPath))
|
||||||
{
|
{
|
||||||
|
Logger.LogDebug($"Deleting old audio file: {WavPath}");
|
||||||
File.Delete(WavPath);
|
File.Delete(WavPath);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Logger.LogDebug("Starting ffmpeg to record audio...");
|
||||||
|
|
||||||
var pInfo = new ProcessStartInfo
|
var pInfo = new ProcessStartInfo
|
||||||
{
|
{
|
||||||
FileName = "ffmpeg",
|
FileName = "ffmpeg",
|
||||||
@@ -41,6 +44,7 @@ public static class AudioRecorder
|
|||||||
var pid = StateTracker.GetRecordingPid();
|
var pid = StateTracker.GetRecordingPid();
|
||||||
if (pid.HasValue)
|
if (pid.HasValue)
|
||||||
{
|
{
|
||||||
|
Logger.LogDebug($"Found active recording process with PID {pid.Value}. Attempting to stop...");
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
var process = Process.GetProcessById(pid.Value);
|
var process = Process.GetProcessById(pid.Value);
|
||||||
|
|||||||
@@ -13,4 +13,7 @@ public class ToakConfig
|
|||||||
public string WhisperLanguage { get; set; } = string.Empty;
|
public string WhisperLanguage { get; set; } = string.Empty;
|
||||||
public string LlmModel { get; set; } = "openai/gpt-oss-20b";
|
public string LlmModel { get; set; } = "openai/gpt-oss-20b";
|
||||||
public string WhisperModel { get; set; } = "whisper-large-v3-turbo";
|
public string WhisperModel { get; set; } = "whisper-large-v3-turbo";
|
||||||
|
public string StartSoundPath { get; set; } = "Assets/Audio/beep.wav";
|
||||||
|
public string StopSoundPath { get; set; } = "Assets/Audio/beep.wav";
|
||||||
|
public List<string> ActiveSkills { get; set; } = new List<string> { "Terminal", "Translate" };
|
||||||
}
|
}
|
||||||
|
|||||||
15
Core/Logger.cs
Normal file
15
Core/Logger.cs
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
namespace Toak.Core;
|
||||||
|
|
||||||
|
/// <summary>
/// Minimal opt-in debug logger.
/// </summary>
public static class Logger
{
    /// <summary>
    /// Enables <see cref="LogDebug"/> output when <c>true</c>. Off by default.
    /// </summary>
    public static bool Verbose { get; set; } = false;

    /// <summary>
    /// Writes a <c>[DEBUG]</c> line with a local-time timestamp when <see cref="Verbose"/> is on; otherwise does nothing.
    /// </summary>
    /// <param name="message">Text appended after the timestamp.</param>
    public static void LogDebug(string message)
    {
        if (!Verbose)
        {
            return;
        }

        // Diagnostics go to stderr: stdout carries the transcription in pipe mode,
        // so debug lines written there would end up inside the piped result.
        Console.Error.WriteLine($"[DEBUG] {DateTime.Now:HH:mm:ss.fff} - {message}");
    }
}
|
||||||
@@ -23,6 +23,7 @@ public static class PromptBuilder
|
|||||||
sb.AppendLine();
|
sb.AppendLine();
|
||||||
sb.AppendLine("FORMATTING RULES:");
|
sb.AppendLine("FORMATTING RULES:");
|
||||||
sb.AppendLine("- CRITICAL: If the <transcript> contains nothing, or very short gibberish, output NOTHING AT ALL (an empty string).");
|
sb.AppendLine("- CRITICAL: If the <transcript> contains nothing, or very short gibberish, output NOTHING AT ALL (an empty string).");
|
||||||
|
sb.AppendLine("- LANGUAGE DETECT: The transcript may be in English or a different language (e.g., Hungarian, Spanish). Detect the language and ensure your output and grammar corrections are STRICTLY in that same language.");
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
13
Core/Skills/ISkill.cs
Normal file
13
Core/Skills/ISkill.cs
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
namespace Toak.Core.Skills;
|
||||||
|
|
||||||
|
/// <summary>
/// Contract for a voice-triggered skill: a set of spoken trigger phrases, a dedicated system prompt,
/// and an optional action to run on the LLM's answer.
/// </summary>
public interface ISkill
{
    /// <summary>Identifier of the skill.</summary>
    string Name { get; }

    /// <summary>Human-readable description of the skill.</summary>
    string Description { get; }

    /// <summary>Phrases that trigger the skill.</summary>
    string[] Hotwords { get; }

    /// <summary>
    /// Whether the skill consumes the LLM result itself through <see cref="Execute"/>.
    /// </summary>
    bool HandlesExecution { get; }

    /// <summary>
    /// Builds the system prompt to use for the LLM request.
    /// </summary>
    /// <param name="rawTranscript">The transcript that triggered the skill.</param>
    /// <returns>The system prompt text.</returns>
    string GetSystemPrompt(string rawTranscript);

    /// <summary>
    /// Performs the skill's action.
    /// </summary>
    /// <param name="llmResult">Text produced by the LLM for this skill.</param>
    void Execute(string llmResult);
}
|
||||||
29
Core/Skills/SkillRegistry.cs
Normal file
29
Core/Skills/SkillRegistry.cs
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
namespace Toak.Core.Skills;
|
||||||
|
|
||||||
|
/// <summary>
/// Holds the built-in skills and selects one for a transcript by its leading hotword.
/// </summary>
public static class SkillRegistry
{
    /// <summary>Every skill known to the application, in lookup order.</summary>
    public static readonly ISkill[] AllSkills =
    {
        new TerminalSkill(),
        new TranslateSkill()
    };

    /// <summary>
    /// Returns the first active skill that has a hotword the trimmed transcript starts with.
    /// </summary>
    /// <param name="transcript">Transcript to inspect; surrounding whitespace is ignored.</param>
    /// <param name="activeSkillNames">Names of the enabled skills, compared case-insensitively with <see cref="ISkill.Name"/>.</param>
    /// <returns>The matching skill, or <c>null</c> when no hotword matches.</returns>
    public static ISkill? DetectSkill(string transcript, IEnumerable<string> activeSkillNames)
    {
        // Narrow down to the enabled skills first, keeping the order of AllSkills.
        ISkill[] candidates = AllSkills
            .Where(skill => activeSkillNames.Contains(skill.Name, StringComparer.OrdinalIgnoreCase))
            .ToArray();

        string spoken = transcript.Trim();

        // Hotwords only count at the very start of the transcript; casing is ignored.
        return candidates.FirstOrDefault(
            skill => skill.Hotwords.Any(
                phrase => spoken.StartsWith(phrase, StringComparison.OrdinalIgnoreCase)));
    }
}
|
||||||
41
Core/Skills/TerminalSkill.cs
Normal file
41
Core/Skills/TerminalSkill.cs
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
using System.Diagnostics;
|
||||||
|
|
||||||
|
namespace Toak.Core.Skills;
|
||||||
|
|
||||||
|
/// <summary>
/// Skill that asks the LLM for a bash command and launches it.
/// </summary>
public class TerminalSkill : ISkill
{
    /// <summary>Identifier of the skill.</summary>
    public string Name => "Terminal";

    /// <summary>Human-readable description of the skill.</summary>
    public string Description => "Translates an intent into a bash command and runs it in the background.";

    /// <summary>Phrases that trigger the skill.</summary>
    public string[] Hotwords => new[] { "System terminal", "System command" };

    /// <summary>Always <c>true</c>: the LLM result is consumed by <see cref="Execute"/>.</summary>
    public bool HandlesExecution => true;

    /// <summary>
    /// Returns the fixed prompt instructing the model to answer with a single raw bash command.
    /// </summary>
    /// <param name="rawTranscript">Not used by this skill; the prompt is constant.</param>
    public string GetSystemPrompt(string rawTranscript)
    {
        return @"You are a command-line assistant. The user will ask you to perform a task.
Translate the request into a single bash command.
Output ONLY the raw bash command to achieve this task. Do not include markdown formatting, backticks, or explanations.";
    }

    /// <summary>
    /// Starts <c>bash -c</c> with the given command without waiting for it to finish, then sends a notification.
    /// Any failure is reported on the console and not rethrown.
    /// </summary>
    /// <param name="llmResult">Command text to run.</param>
    public void Execute(string llmResult)
    {
        // NOTE(review): this runs model-generated text as a shell command with no confirmation step.
        try
        {
            Console.WriteLine($"[TerminalSkill] Executing: {llmResult}");

            var pInfo = new ProcessStartInfo
            {
                FileName = "bash",
                UseShellExecute = false,
                CreateNoWindow = true
            };

            // ArgumentList hands the command to bash as one argument, exactly as written.
            // Building a quoted Arguments string by hand mangled commands containing
            // backslashes or already-escaped quotes.
            pInfo.ArgumentList.Add("-c");
            pInfo.ArgumentList.Add(llmResult);

            // Disposing the Process object only releases our handle; the command keeps running.
            using var started = Process.Start(pInfo);
            IO.Notifications.Notify("Toak Terminal Executed", llmResult);
        }
        catch (Exception ex)
        {
            Console.WriteLine($"[TerminalSkill Error] {ex.Message}");
        }
    }
}
|
||||||
23
Core/Skills/TranslateSkill.cs
Normal file
23
Core/Skills/TranslateSkill.cs
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
namespace Toak.Core.Skills;
|
||||||
|
|
||||||
|
/// <summary>
/// Skill that has the LLM translate the spoken text; it performs no action of its own.
/// </summary>
public class TranslateSkill : ISkill
{
    /// <summary>Identifier of the skill.</summary>
    public string Name
    {
        get { return "Translate"; }
    }

    /// <summary>Human-readable description of the skill.</summary>
    public string Description
    {
        get { return "Translates the spoken text into another language on the fly."; }
    }

    /// <summary>Phrases that trigger the skill.</summary>
    public string[] Hotwords
    {
        get { return new[] { "System translate to", "System translate into" }; }
    }

    /// <summary>Always <c>false</c>: this skill does not consume the LLM result itself.</summary>
    public bool HandlesExecution
    {
        get { return false; }
    }

    /// <summary>
    /// Returns the fixed translator prompt.
    /// </summary>
    /// <param name="rawTranscript">Not used by this skill; the prompt is constant.</param>
    public string GetSystemPrompt(string rawTranscript) =>
        @"You are an expert translator. The user wants to translate the following text.
The first few words identify the target language (e.g. 'Translate to Spanish:', 'Translate into Hungarian:').
Translate the REST of the transcript into that target language.
Output ONLY the final translated text. Do not include markdown, explanations, or quotes.";

    /// <summary>
    /// Intentionally empty: <see cref="HandlesExecution"/> is <c>false</c>.
    /// </summary>
    /// <param name="llmResult">Ignored.</param>
    public void Execute(string llmResult)
    {
    }
}
|
||||||
@@ -11,6 +11,7 @@ public static class StateTracker
|
|||||||
|
|
||||||
public static void SetRecording(int ffmpegPid)
|
public static void SetRecording(int ffmpegPid)
|
||||||
{
|
{
|
||||||
|
Logger.LogDebug($"Setting recording state with PID {ffmpegPid}");
|
||||||
File.WriteAllText(StateFilePath, ffmpegPid.ToString());
|
File.WriteAllText(StateFilePath, ffmpegPid.ToString());
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -21,6 +22,7 @@ public static class StateTracker
|
|||||||
var content = File.ReadAllText(StateFilePath).Trim();
|
var content = File.ReadAllText(StateFilePath).Trim();
|
||||||
if (int.TryParse(content, out var pid))
|
if (int.TryParse(content, out var pid))
|
||||||
{
|
{
|
||||||
|
Logger.LogDebug($"Read recording PID {pid} from state file");
|
||||||
return pid;
|
return pid;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -31,6 +33,7 @@ public static class StateTracker
|
|||||||
{
|
{
|
||||||
if (File.Exists(StateFilePath))
|
if (File.Exists(StateFilePath))
|
||||||
{
|
{
|
||||||
|
Logger.LogDebug("Clearing recording state file");
|
||||||
File.Delete(StateFilePath);
|
File.Delete(StateFilePath);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
55
IDEAS.md
55
IDEAS.md
@@ -197,6 +197,61 @@ toak status # Check if daemon is running
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## Future Innovations
|
||||||
|
|
||||||
|
### Hotword Commands (LLM Routing)
|
||||||
|
Instruct the LLM in `PromptBuilder` to output a specific JSON structure if given a command phrase. If a specific hotword like "System command" or "Computer dictate" is detected at the start of the audio, Toak parses the JSON, skips typing the text out via `xdotool`/`wtype`, and instead executes a pre-defined background action.
|
||||||
|
|
||||||
|
If it doesn't hear a command phrase, it simply returns the text normally and types it.
|
||||||
|
|
||||||
|
**How it works (Under the Hood):**
|
||||||
|
The LLM is prompted to always return JSON in the background when a command is directed at the assistant.
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"is_command": true,
|
||||||
|
"action": "append_to_notes",
|
||||||
|
"content": "Buy milk and eggs",
|
||||||
|
"meta": {}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
**Alternative Hotword Ideas:**
|
||||||
|
Since "Toak" is not a real English word, Whisper might transcribe it as "talk", "toke", or "oak." It is highly recommended to use distinct, phonetically clear hotwords such as:
|
||||||
|
- **"System..."** (e.g. "System note:")
|
||||||
|
- **"Computer..."** (e.g. "Computer search:")
|
||||||
|
- **"Action..."** (e.g. "Action commit:")
|
||||||
|
- **"Dictate..."** (e.g. "Dictate terminal:")
|
||||||
|
- **"Listen up..."** (e.g. "Listen up translate...")
|
||||||
|
|
||||||
|
**Prompt Ideas & Use Cases:**
|
||||||
|
|
||||||
|
1. **Quick Notes / Brainstorming:**
|
||||||
|
- *Hotword:* `"System note:"` or `"Drop this in my notes:"`
|
||||||
|
- *Action:* Appends the spoken text to a configured `~/notes.md` file in the background without interrupting your current window.
|
||||||
|
- *Example:* "System note: I need to remember to check the database migrations later today."
|
||||||
|
|
||||||
|
2. **Terminal / CLI Execution:**
|
||||||
|
- *Hotword:* `"Computer terminal:"` or `"Command:"`
|
||||||
|
- *Action:* Takes the natural language command, asks the LLM to translate it into a bash command, and types it into a new tmux window or background process.
|
||||||
|
- *Example:* "Computer terminal: find all python files modified in the last 2 days."
|
||||||
|
|
||||||
|
3. **Git Commit Messages:**
|
||||||
|
- *Hotword:* `"Action commit:"`
|
||||||
|
- *Action:* Automatically formats the dictated text into a standard conventional commit message, stages all files, and commits them.
|
||||||
|
- *Example:* "Action commit: I refactored the audio recorder to use native processes instead of the old library." -> LLM outputs `refactor(audio): migrate to native processes` and runs `git commit -am "..."`.
|
||||||
|
|
||||||
|
4. **Web Search / Lookup:**
|
||||||
|
- *Hotword:* `"System search:"` or `"Look up:"`
|
||||||
|
- *Action:* Opens your default browser and performs a search for the spoken phrase.
|
||||||
|
- *Example:* "System search: MDN documentation for grid layout."
|
||||||
|
|
||||||
|
5. **Translating on the fly:**
|
||||||
|
- *Hotword:* `"Translate to Spanish:"`
|
||||||
|
- *Action:* Instead of typing English, it types the translated version of the rest of the sentence.
|
||||||
|
- *Example:* "Translate to Spanish: Hello, how are you today?" -> Types out `Hola, ¿cómo estás hoy?`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## Implementation Priority
|
## Implementation Priority
|
||||||
|
|
||||||
### Tier 1: High Impact, Low Effort
|
### Tier 1: High Impact, Low Effort
|
||||||
|
|||||||
@@ -22,4 +22,47 @@ public static class Notifications
|
|||||||
Console.WriteLine($"[Notifications] Failed to send notification: {ex.Message}");
|
Console.WriteLine($"[Notifications] Failed to send notification: {ex.Message}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Plays a sound file through <c>paplay</c> without waiting for playback to finish.
/// </summary>
/// <remarks>
/// A relative path is resolved against the application base directory. When no file exists there,
/// an embedded resource named <c>Toak.</c> plus the path with separators turned into dots is extracted
/// to the temp directory (once) and played instead. When neither exists the call does nothing.
/// Any failure is reported on the console and not rethrown.
/// </remarks>
/// <param name="soundPath">Path of the sound; a null or blank value is ignored.</param>
public static void PlaySound(string soundPath)
{
    if (string.IsNullOrWhiteSpace(soundPath))
    {
        return;
    }

    try
    {
        var absolutePath = soundPath;
        if (!Path.IsPathRooted(absolutePath))
        {
            absolutePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, absolutePath);
        }

        if (!File.Exists(absolutePath))
        {
            // Fall back to the copy embedded in the assembly.
            var resourceName = "Toak." + soundPath.Replace("/", ".").Replace("\\", ".");
            using var stream = System.Reflection.Assembly.GetExecutingAssembly().GetManifestResourceStream(resourceName);
            if (stream == null)
            {
                return;
            }

            // The extracted copy is reused on later calls.
            absolutePath = Path.Combine(Path.GetTempPath(), "toak_" + Path.GetFileName(soundPath));
            if (!File.Exists(absolutePath))
            {
                using var fileStream = File.Create(absolutePath);
                stream.CopyTo(fileStream);
            }
        }

        var pInfo = new ProcessStartInfo
        {
            FileName = "paplay",
            UseShellExecute = false,
            CreateNoWindow = true
        };

        // ArgumentList passes the path as a single argument exactly as written;
        // hand-quoting it broke for paths containing '"' or ending in '\'.
        pInfo.ArgumentList.Add(absolutePath);

        // Disposing the Process object only releases our handle; playback continues.
        using var player = Process.Start(pInfo);
    }
    catch (Exception ex)
    {
        Console.WriteLine($"[Notifications] Failed to play sound: {ex.Message}");
    }
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,11 +1,14 @@
|
|||||||
using System.Diagnostics;
|
using System.Diagnostics;
|
||||||
|
|
||||||
|
using Toak.Core;
|
||||||
|
|
||||||
namespace Toak.IO;
|
namespace Toak.IO;
|
||||||
|
|
||||||
public static class TextInjector
|
public static class TextInjector
|
||||||
{
|
{
|
||||||
public static void Inject(string text, string backend)
|
public static void Inject(string text, string backend)
|
||||||
{
|
{
|
||||||
|
Logger.LogDebug($"Injecting text: '{text}' with {backend}");
|
||||||
if (string.IsNullOrWhiteSpace(text)) return;
|
if (string.IsNullOrWhiteSpace(text)) return;
|
||||||
|
|
||||||
try
|
try
|
||||||
@@ -13,6 +16,7 @@ public static class TextInjector
|
|||||||
ProcessStartInfo pInfo;
|
ProcessStartInfo pInfo;
|
||||||
if (backend.ToLowerInvariant() == "wtype")
|
if (backend.ToLowerInvariant() == "wtype")
|
||||||
{
|
{
|
||||||
|
Logger.LogDebug($"Injecting text using wtype...");
|
||||||
pInfo = new ProcessStartInfo
|
pInfo = new ProcessStartInfo
|
||||||
{
|
{
|
||||||
FileName = "wtype",
|
FileName = "wtype",
|
||||||
@@ -23,6 +27,7 @@ public static class TextInjector
|
|||||||
}
|
}
|
||||||
else // xdotool
|
else // xdotool
|
||||||
{
|
{
|
||||||
|
Logger.LogDebug($"Injecting text using xdotool...");
|
||||||
pInfo = new ProcessStartInfo
|
pInfo = new ProcessStartInfo
|
||||||
{
|
{
|
||||||
FileName = "xdotool",
|
FileName = "xdotool",
|
||||||
@@ -40,4 +45,57 @@ public static class TextInjector
|
|||||||
Notifications.Notify("Injection Error", "Could not type text into window.");
|
Notifications.Notify("Injection Error", "Could not type text into window.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// <summary>
/// Pipes tokens into a typing tool's standard input as they arrive, then waits for the tool to exit.
/// </summary>
/// <remarks>
/// Any failure is reported on the console and through a notification; it is not rethrown.
/// </remarks>
/// <param name="tokenStream">Text fragments to type, in order.</param>
/// <param name="backend">
/// <c>wtype</c> (case-insensitive) selects wtype; any other value, including <c>null</c>, selects xdotool.
/// </param>
public static async Task InjectStreamAsync(IAsyncEnumerable<string> tokenStream, string backend)
{
    try
    {
        bool useWtype = string.Equals(backend, "wtype", StringComparison.OrdinalIgnoreCase);
        string tool = useWtype ? "wtype" : "xdotool";

        Logger.LogDebug($"Setting up stream injection using {tool}...");

        // Both tools are told to take the text from stdin ("-").
        var pInfo = new ProcessStartInfo
        {
            FileName = tool,
            Arguments = useWtype ? "-" : "type --clearmodifiers --delay 0 --file -",
            UseShellExecute = false,
            CreateNoWindow = true,
            RedirectStandardInput = true
        };

        using var process = Process.Start(pInfo);
        if (process == null)
        {
            return;
        }

        Logger.LogDebug("Started stream injection process, waiting for tokens...");

        try
        {
            await foreach (var token in tokenStream)
            {
                Logger.LogDebug($"Injecting token: '{token}'");
                await process.StandardInput.WriteAsync(token);
                // Flush per token so text reaches the tool as soon as it is produced.
                await process.StandardInput.FlushAsync();
            }

            Logger.LogDebug("Stream injection complete. Closing standard input.");
        }
        finally
        {
            // Close stdin even when the token stream fails, so the tool is not left waiting for more input.
            process.StandardInput.Close();
        }

        await process.WaitForExitAsync();
    }
    catch (Exception ex)
    {
        Console.WriteLine($"[TextInjector] Error injecting text stream: {ex.Message}");
        Notifications.Notify("Injection Error", "Could not type text stream into window.");
    }
}
|
||||||
}
|
}
|
||||||
|
|||||||
73
Program.cs
73
Program.cs
@@ -5,8 +5,11 @@ using Toak.Api;
|
|||||||
using Toak.Core;
|
using Toak.Core;
|
||||||
using Toak.IO;
|
using Toak.IO;
|
||||||
|
|
||||||
bool pipeToStdout = args.Contains("--pipe") || args.Contains("-p") || Console.IsOutputRedirected;
|
bool pipeToStdout = args.Contains("--pipe") || args.Contains("-p");
|
||||||
bool copyToClipboard = args.Contains("--copy");
|
bool copyToClipboard = args.Contains("--copy");
|
||||||
|
bool verbose = args.Contains("-v") || args.Contains("--verbose");
|
||||||
|
|
||||||
|
Logger.Verbose = verbose;
|
||||||
|
|
||||||
|
|
||||||
string command = "";
|
string command = "";
|
||||||
@@ -29,6 +32,7 @@ if (args.Contains("-h") || args.Contains("--help") || (string.IsNullOrEmpty(comm
|
|||||||
Console.WriteLine(" -h, --help - Show this help message");
|
Console.WriteLine(" -h, --help - Show this help message");
|
||||||
Console.WriteLine(" -p, --pipe - Output transcription to stdout instead of typing");
|
Console.WriteLine(" -p, --pipe - Output transcription to stdout instead of typing");
|
||||||
Console.WriteLine(" --copy - Copy to clipboard instead of typing");
|
Console.WriteLine(" --copy - Copy to clipboard instead of typing");
|
||||||
|
Console.WriteLine(" -v, --verbose - Enable detailed debug logging");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -71,6 +75,17 @@ if (command == "onboard")
|
|||||||
var backend = Console.ReadLine();
|
var backend = Console.ReadLine();
|
||||||
if (!string.IsNullOrWhiteSpace(backend)) config.TypingBackend = backend.ToLowerInvariant();
|
if (!string.IsNullOrWhiteSpace(backend)) config.TypingBackend = backend.ToLowerInvariant();
|
||||||
|
|
||||||
|
Console.WriteLine();
|
||||||
|
var availableSkills = Toak.Core.Skills.SkillRegistry.AllSkills.Select(s => s.Name);
|
||||||
|
Console.WriteLine($"Active Skills (comma separated) [{string.Join(", ", config.ActiveSkills)}]:");
|
||||||
|
Console.WriteLine($" Available: {string.Join(", ", availableSkills)}");
|
||||||
|
Console.Write("Selection: ");
|
||||||
|
var skillsInput = Console.ReadLine();
|
||||||
|
if (!string.IsNullOrWhiteSpace(skillsInput))
|
||||||
|
{
|
||||||
|
config.ActiveSkills = skillsInput.Split(',', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries).ToList();
|
||||||
|
}
|
||||||
|
|
||||||
ConfigManager.SaveConfig(config);
|
ConfigManager.SaveConfig(config);
|
||||||
Console.WriteLine("Configuration saved.");
|
Console.WriteLine("Configuration saved.");
|
||||||
return;
|
return;
|
||||||
@@ -85,6 +100,7 @@ if (command == "show")
|
|||||||
Console.WriteLine($" Whisper Model: {config.WhisperModel}");
|
Console.WriteLine($" Whisper Model: {config.WhisperModel}");
|
||||||
Console.WriteLine($" Spoken Language: {(string.IsNullOrEmpty(config.WhisperLanguage) ? "Auto" : config.WhisperLanguage)}");
|
Console.WriteLine($" Spoken Language: {(string.IsNullOrEmpty(config.WhisperLanguage) ? "Auto" : config.WhisperLanguage)}");
|
||||||
Console.WriteLine($" Typing Backend: {config.TypingBackend}");
|
Console.WriteLine($" Typing Backend: {config.TypingBackend}");
|
||||||
|
Console.WriteLine($" Active Skills: {string.Join(", ", config.ActiveSkills)}");
|
||||||
Console.WriteLine($" Style Mode: {config.StyleMode}");
|
Console.WriteLine($" Style Mode: {config.StyleMode}");
|
||||||
Console.WriteLine($" Punctuation Module: {config.ModulePunctuation}");
|
Console.WriteLine($" Punctuation Module: {config.ModulePunctuation}");
|
||||||
Console.WriteLine($" Technical Sanitization: {config.ModuleTechnicalSanitization}");
|
Console.WriteLine($" Technical Sanitization: {config.ModuleTechnicalSanitization}");
|
||||||
@@ -244,12 +260,15 @@ if (command == "toggle")
|
|||||||
{
|
{
|
||||||
if (StateTracker.IsRecording())
|
if (StateTracker.IsRecording())
|
||||||
{
|
{
|
||||||
|
var config = ConfigManager.LoadConfig();
|
||||||
|
Notifications.PlaySound(config.StopSoundPath);
|
||||||
|
|
||||||
if (!pipeToStdout) Console.WriteLine("Stopping recording and transcribing...");
|
if (!pipeToStdout) Console.WriteLine("Stopping recording and transcribing...");
|
||||||
if (!pipeToStdout) Notifications.Notify("Toak", "Transcribing...");
|
if (!pipeToStdout) Notifications.Notify("Toak", "Transcribing...");
|
||||||
|
|
||||||
AudioRecorder.StopRecording();
|
AudioRecorder.StopRecording();
|
||||||
|
|
||||||
var config = ConfigManager.LoadConfig();
|
Logger.LogDebug($"Loaded configuration: LLM={config.LlmModel}, Whisper={config.WhisperModel}, Typing={config.TypingBackend}");
|
||||||
|
|
||||||
if (string.IsNullOrWhiteSpace(config.GroqApiKey))
|
if (string.IsNullOrWhiteSpace(config.GroqApiKey))
|
||||||
{
|
{
|
||||||
@@ -271,7 +290,9 @@ if (command == "toggle")
|
|||||||
var stopWatch = Stopwatch.StartNew();
|
var stopWatch = Stopwatch.StartNew();
|
||||||
|
|
||||||
// 1. STT
|
// 1. STT
|
||||||
|
Logger.LogDebug($"Starting STT transcription via Whisper for {wavPath}...");
|
||||||
var transcript = await groq.TranscribeAsync(wavPath, config.WhisperLanguage, config.WhisperModel);
|
var transcript = await groq.TranscribeAsync(wavPath, config.WhisperLanguage, config.WhisperModel);
|
||||||
|
Logger.LogDebug($"Raw transcript received: '{transcript}'");
|
||||||
|
|
||||||
if (string.IsNullOrWhiteSpace(transcript))
|
if (string.IsNullOrWhiteSpace(transcript))
|
||||||
{
|
{
|
||||||
@@ -279,32 +300,64 @@ if (command == "toggle")
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
string finalText = transcript;
|
|
||||||
|
|
||||||
// 2. LLM Refinement
|
// 2. LLM Refinement
|
||||||
var systemPrompt = PromptBuilder.BuildPrompt(config);
|
var detectedSkill = Toak.Core.Skills.SkillRegistry.DetectSkill(transcript, config.ActiveSkills);
|
||||||
finalText = await groq.RefineTextAsync(transcript, systemPrompt, config.LlmModel);
|
string systemPrompt;
|
||||||
|
if (detectedSkill != null)
|
||||||
|
{
|
||||||
|
Logger.LogDebug($"Skill detected: {detectedSkill.Name}");
|
||||||
|
if (!pipeToStdout) Notifications.Notify("Toak Skill Detected", detectedSkill.Name);
|
||||||
|
systemPrompt = detectedSkill.GetSystemPrompt(transcript);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
systemPrompt = PromptBuilder.BuildPrompt(config);
|
||||||
|
}
|
||||||
|
|
||||||
|
// 3. Output
|
||||||
|
if (detectedSkill != null && detectedSkill.HandlesExecution)
|
||||||
|
{
|
||||||
|
Logger.LogDebug($"Executing skill synchronously: {detectedSkill.Name}");
|
||||||
|
string finalText = await groq.RefineTextAsync(transcript, systemPrompt, config.LlmModel);
|
||||||
|
Logger.LogDebug($"Skill refined text: '{finalText}'");
|
||||||
|
if (string.IsNullOrWhiteSpace(finalText))
|
||||||
|
{
|
||||||
|
if (!pipeToStdout) Notifications.Notify("Toak", "Dropped short or empty audio.");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
detectedSkill.Execute(finalText);
|
||||||
|
stopWatch.Stop();
|
||||||
|
Notifications.Notify("Toak", $"Skill executed in {stopWatch.ElapsedMilliseconds}ms");
|
||||||
|
}
|
||||||
|
else if (pipeToStdout || copyToClipboard)
|
||||||
|
{
|
||||||
|
Logger.LogDebug("Starting LLM text refinement (synchronous)...");
|
||||||
|
string finalText = await groq.RefineTextAsync(transcript, systemPrompt, config.LlmModel);
|
||||||
|
Logger.LogDebug($"Refined text received: '{finalText}'");
|
||||||
if (string.IsNullOrWhiteSpace(finalText))
|
if (string.IsNullOrWhiteSpace(finalText))
|
||||||
{
|
{
|
||||||
if (!pipeToStdout) Notifications.Notify("Toak", "Dropped short or empty audio.");
|
if (!pipeToStdout) Notifications.Notify("Toak", "Dropped short or empty audio.");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3. Output
|
|
||||||
if (pipeToStdout)
|
if (pipeToStdout)
|
||||||
{
|
{
|
||||||
Console.WriteLine(finalText);
|
Console.WriteLine(finalText);
|
||||||
}
|
}
|
||||||
else if (copyToClipboard)
|
else
|
||||||
{
|
{
|
||||||
ClipboardManager.Copy(finalText);
|
ClipboardManager.Copy(finalText);
|
||||||
stopWatch.Stop();
|
stopWatch.Stop();
|
||||||
Notifications.Notify("Toak", $"Copied to clipboard in {stopWatch.ElapsedMilliseconds}ms");
|
Notifications.Notify("Toak", $"Copied to clipboard in {stopWatch.ElapsedMilliseconds}ms");
|
||||||
}
|
}
|
||||||
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
TextInjector.Inject(finalText, config.TypingBackend);
|
Logger.LogDebug("Starting LLM text refinement (streaming)...");
|
||||||
|
var tokenStream = groq.RefineTextStreamAsync(transcript, systemPrompt, config.LlmModel);
|
||||||
|
Logger.LogDebug("Starting to inject text...");
|
||||||
|
await TextInjector.InjectStreamAsync(tokenStream, config.TypingBackend);
|
||||||
stopWatch.Stop();
|
stopWatch.Stop();
|
||||||
Notifications.Notify("Toak", $"Done in {stopWatch.ElapsedMilliseconds}ms");
|
Notifications.Notify("Toak", $"Done in {stopWatch.ElapsedMilliseconds}ms");
|
||||||
}
|
}
|
||||||
@@ -323,6 +376,8 @@ if (command == "toggle")
|
|||||||
{
|
{
|
||||||
// Start recording
|
// Start recording
|
||||||
if (!pipeToStdout) Console.WriteLine("Starting recording...");
|
if (!pipeToStdout) Console.WriteLine("Starting recording...");
|
||||||
|
var config = ConfigManager.LoadConfig();
|
||||||
|
Notifications.PlaySound(config.StartSoundPath);
|
||||||
AudioRecorder.StartRecording();
|
AudioRecorder.StartRecording();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,6 +14,10 @@ namespace Toak.Serialization;
|
|||||||
[JsonSerializable(typeof(LlamaChoice))]
|
[JsonSerializable(typeof(LlamaChoice))]
|
||||||
[JsonSerializable(typeof(LlamaRequestMessage[]))]
|
[JsonSerializable(typeof(LlamaRequestMessage[]))]
|
||||||
[JsonSerializable(typeof(LlamaChoice[]))]
|
[JsonSerializable(typeof(LlamaChoice[]))]
|
||||||
|
[JsonSerializable(typeof(LlamaStreamResponse))]
|
||||||
|
[JsonSerializable(typeof(LlamaStreamChoice))]
|
||||||
|
[JsonSerializable(typeof(LlamaStreamDelta))]
|
||||||
|
[JsonSerializable(typeof(LlamaStreamChoice[]))]
|
||||||
internal partial class AppJsonSerializerContext : JsonSerializerContext
|
internal partial class AppJsonSerializerContext : JsonSerializerContext
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,4 +8,8 @@
|
|||||||
<PublishAot>true</PublishAot>
|
<PublishAot>true</PublishAot>
|
||||||
</PropertyGroup>
|
</PropertyGroup>
|
||||||
|
|
||||||
|
<ItemGroup>
|
||||||
|
<EmbeddedResource Include="Assets\Audio\**" />
|
||||||
|
</ItemGroup>
|
||||||
|
|
||||||
</Project>
|
</Project>
|
||||||
|
|||||||
Reference in New Issue
Block a user