using System.Collections.Concurrent;
using OpenQuery.Models;
using OpenQuery.Services;

namespace OpenQuery.Tools;

/// <summary>
/// Web search tool. Runs a set of generated queries through the SearXNG client, fetches and
/// chunks the resulting articles, embeds the chunks, and returns the chunks most similar to
/// the original query as a single formatted context string.
/// </summary>
public class SearchTool
{
    private readonly SearxngClient _searxngClient;
    private readonly EmbeddingService _embeddingService;
    private readonly ParallelProcessingOptions _options;

    /// <summary>Tool identifier.</summary>
    public static string Name => "search";

    /// <summary>Human-readable tool description.</summary>
    public static string Description => "Search the web for information on a topic";

    /// <summary>
    /// Creates the tool with the given search client and embedding service.
    /// Parallelism settings come from a default-constructed <see cref="ParallelProcessingOptions"/>.
    /// </summary>
    /// <param name="searxngClient">Client used to run the web searches.</param>
    /// <param name="embeddingService">Service used to embed the query and the article chunks.</param>
    public SearchTool(SearxngClient searxngClient, EmbeddingService embeddingService)
    {
        _searxngClient = searxngClient;
        _embeddingService = embeddingService;
        _options = new ParallelProcessingOptions();
    }

    /// <summary>
    /// Runs the full pipeline: search, article fetching, embedding, and ranking.
    /// </summary>
    /// <param name="originalQuery">The user's query; chunks are ranked by similarity to it.</param>
    /// <param name="generatedQueries">Queries to send to the search client.</param>
    /// <param name="maxResults">Value passed to the search client for each query.</param>
    /// <param name="topChunksLimit">Maximum number of ranked chunks to include in the result.</param>
    /// <param name="onProgress">Optional callback that receives progress messages.</param>
    /// <param name="verbose">When true, search and fetch failures are written to the console as warnings.</param>
    /// <returns>
    /// The top chunks joined into one string (one "[Source n: title](url)" header per chunk),
    /// or a short explanatory message when a phase produced nothing to work with.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="originalQuery"/> or <paramref name="generatedQueries"/> is null.
    /// </exception>
    public async Task<string> ExecuteAsync(
        string originalQuery,
        List<string> generatedQueries,
        int maxResults,
        int topChunksLimit,
        Action<string>? onProgress = null,
        bool verbose = true)
    {
        ArgumentNullException.ThrowIfNull(originalQuery);
        ArgumentNullException.ThrowIfNull(generatedQueries);

        // Phase 1: Parallel Searches
        var urls = await ExecuteParallelSearchesAsync(generatedQueries, maxResults, onProgress, verbose);
        if (urls.Count == 0)
        {
            return "No search results found.";
        }

        // Phase 2: Parallel Article Fetching
        var chunks = await ExecuteParallelArticleFetchingAsync(urls, onProgress, verbose);
        if (chunks.Count == 0)
        {
            return "Found search results but could not extract readable content.";
        }

        // Phase 3: Parallel Embeddings with Rate Limiting
        var queryEmbedding = await ExecuteParallelEmbeddingsAsync(originalQuery, chunks, onProgress);

        // Phase 4: Ranking
        var topChunks = RankAndSelectTopChunks(chunks, queryEmbedding, topChunksLimit);
        if (topChunks.Count == 0)
        {
            // Every chunk embedding failed (or the limit was not positive); say so instead of
            // handing the caller an empty context string.
            return "Found readable content but could not rank any chunks for relevance.";
        }

        onProgress?.Invoke($"[Found top {topChunks.Count} most relevant chunks overall. Generating answer...]");

        return string.Join(
            "\n\n",
            topChunks.Select((c, i) => $"[Source {i + 1}: {c.Title ?? "Unknown"}]({c.SourceUrl})\n{c.Content}"));
    }

    /// <summary>
    /// Runs every query concurrently and collects the distinct result URLs.
    /// A failing query is reported (when <paramref name="verbose"/>) and skipped.
    /// </summary>
    private async Task<List<string>> ExecuteParallelSearchesAsync(
        List<string> generatedQueries,
        int maxResults,
        Action<string>? onProgress,
        bool verbose)
    {
        // Only the URL of each search hit is used by the later phases, so that is all we keep.
        var allUrls = new ConcurrentBag<string>();

        var searchTasks = generatedQueries.Select(async query =>
        {
            onProgress?.Invoke($"[Searching web for '{query}'...]");
            try
            {
                var results = await _searxngClient.SearchAsync(query, maxResults);
                foreach (var result in results)
                {
                    if (!string.IsNullOrWhiteSpace(result.Url))
                    {
                        allUrls.Add(result.Url);
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: one failed query must not abort the others.
                if (verbose)
                {
                    Console.WriteLine($"Warning: Search failed for query '{query}': {ex.Message}");
                }
            }
        });

        await Task.WhenAll(searchTasks);

        return allUrls.Distinct().ToList();
    }

    /// <summary>
    /// Fetches the articles behind <paramref name="urls"/> concurrently, limited by
    /// <c>MaxConcurrentArticleFetches</c>, and splits each readable article into chunks.
    /// Invalid URLs and failed fetches are reported (when <paramref name="verbose"/>) and skipped.
    /// </summary>
    private async Task<List<Chunk>> ExecuteParallelArticleFetchingAsync(
        List<string> urls,
        Action<string>? onProgress,
        bool verbose)
    {
        var chunks = new ConcurrentBag<Chunk>();
        var startedFetches = 0;
        var totalFetches = urls.Count;

        // Disposed when the method exits; Task.WhenAll below only returns (or throws) after
        // every fetch task has finished, so no task can touch it afterwards.
        using var semaphore = new SemaphoreSlim(_options.MaxConcurrentArticleFetches);

        var fetchTasks = urls.Select(async url =>
        {
            await semaphore.WaitAsync();
            try
            {
                var current = Interlocked.Increment(ref startedFetches);

                // A malformed URL must only skip this article, not fault the whole batch.
                if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
                {
                    if (verbose)
                    {
                        Console.WriteLine($"Warning: Failed to fetch article {url}: invalid URL");
                    }

                    return;
                }

                onProgress?.Invoke($"[Fetching article {current}/{totalFetches}: {uri.Host}]");

                try
                {
                    var article = await ArticleService.FetchArticleAsync(url);
                    if (!article.IsReadable || string.IsNullOrEmpty(article.TextContent))
                    {
                        return;
                    }

                    foreach (var chunkText in ChunkingService.ChunkText(article.TextContent))
                    {
                        chunks.Add(new Chunk(chunkText, url, article.Title));
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: one unreachable article must not abort the others.
                    if (verbose)
                    {
                        Console.WriteLine($"Warning: Failed to fetch article {url}: {ex.Message}");
                    }
                }
            }
            finally
            {
                semaphore.Release();
            }
        });

        await Task.WhenAll(fetchTasks);

        return chunks.ToList();
    }

    /// <summary>
    /// Embeds the query and all chunk texts concurrently and stores each non-empty chunk
    /// embedding on its chunk. Chunks whose embedding came back empty keep a null
    /// <c>Embedding</c> and are ignored by ranking.
    /// </summary>
    /// <returns>The embedding of <paramref name="originalQuery"/>.</returns>
    private async Task<float[]> ExecuteParallelEmbeddingsAsync(
        string originalQuery,
        List<Chunk> chunks,
        Action<string>? onProgress)
    {
        onProgress?.Invoke($"[Generating embeddings for {chunks.Count} chunks and query...]");

        // Start query embedding and chunk embeddings concurrently
        var queryEmbeddingTask = _embeddingService.GetEmbeddingAsync(originalQuery);
        var chunkTexts = chunks.Select(c => c.Content).ToList();
        var chunkEmbeddingsTask = _embeddingService.GetEmbeddingsWithRateLimitAsync(chunkTexts, onProgress);

        await Task.WhenAll(queryEmbeddingTask, chunkEmbeddingsTask);

        var queryEmbedding = await queryEmbeddingTask;
        var chunkEmbeddings = await chunkEmbeddingsTask;

        // Pair chunks with their vectors positionally. Zip stops at the shorter sequence, so a
        // short result cannot index out of range; empty vectors mark failed batches.
        foreach (var (chunk, embedding) in chunks.Zip(chunkEmbeddings))
        {
            if (embedding is { Length: > 0 })
            {
                chunk.Embedding = embedding;
            }
        }

        return queryEmbedding;
    }

    /// <summary>
    /// Scores every chunk that has an embedding by cosine similarity to the query embedding
    /// and returns up to <paramref name="topChunksLimit"/> chunks, highest score first.
    /// </summary>
    private static List<Chunk> RankAndSelectTopChunks(
        List<Chunk> chunks,
        float[] queryEmbedding,
        int topChunksLimit)
    {
        var scored = new List<Chunk>(chunks.Count);

        foreach (var chunk in chunks)
        {
            if (chunk.Embedding is { } embedding)
            {
                chunk.Score = EmbeddingService.CosineSimilarity(queryEmbedding, embedding);
                scored.Add(chunk);
            }
        }

        return scored
            .OrderByDescending(c => c.Score)
            .Take(topChunksLimit)
            .ToList();
    }

    /// <summary>
    /// Synchronous entry point; not supported for this tool.
    /// </summary>
    /// <exception cref="InvalidOperationException">Always thrown.</exception>
    public static string Execute(string argumentsJson)
    {
        throw new InvalidOperationException("Use ExecuteAsync instead");
    }
}