using System;
using System.Collections.Generic;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;
using System.Xml.Linq;

/// <summary>
/// Command-line tool that collects every URL listed in one or more XML sitemaps
/// (following sitemap indexes recursively), probes each URL with HEAD (falling back
/// to GET), and writes the status code and X-Cache header of each to a combined CSV file.
/// </summary>
class Program
{
    /// <summary>Maximum number of URL checks allowed in flight at the same time.</summary>
    private const int MaxConcurrentChecks = 10;

    /// <summary>Header row of the generated CSV file.</summary>
    private const string CsvHeader = "URL,StatusCode,X-Cache,MethodUsed,SourceSitemap";

    /// <summary>XML namespace of the sitemap protocol elements.</summary>
    private static readonly XNamespace SitemapNamespace = "http://www.sitemaps.org/schemas/sitemap/0.9";

    /// <summary>Characters that force a CSV field to be quoted.</summary>
    private static readonly char[] CsvSpecialChars = { ',', '"', '\r', '\n' };

    /// <summary>
    /// Single client reused for all sitemap downloads and URL checks; creating a new
    /// <see cref="HttpClient"/> per request can exhaust sockets under load.
    /// </summary>
    private static readonly HttpClient SharedHttpClient = new HttpClient();

    /// <summary>
    /// Entry point. Each argument is a sitemap URL; all discovered URLs are checked and
    /// the results are written to a timestamped CSV file in the current directory.
    /// </summary>
    /// <param name="args">Sitemap URLs to process.</param>
    static async Task Main(string[] args)
    {
        if (args.Length == 0)
        {
            Console.WriteLine("Usage: dotnet run ...");
            return;
        }

        // Maps each discovered URL to the sitemap it was first seen in.
        // NOTE(review): keys are compared case-insensitively, so URLs that differ only
        // by letter case are treated as duplicates - confirm this is intended.
        var allUrls = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);

        // Collect URLs from all sitemaps; a failure in one sitemap must not stop the others.
        foreach (string sitemapUrl in args)
        {
            Console.WriteLine($"\nProcessing sitemap: {sitemapUrl}");
            try
            {
                await GetUrlsFromSitemapAsync(sitemapUrl, allUrls, sitemapUrl);
            }
            catch (Exception ex)
            {
                Console.Error.WriteLine($"Error processing {sitemapUrl}: {ex.Message}");
            }
        }

        Console.WriteLine($"\nTotal URLs collected: {allUrls.Count}");

        // Invariant culture keeps the file name stable regardless of the machine's calendar settings.
        string timestamp = DateTime.UtcNow.ToString("yyyyMMdd_HHmmss", System.Globalization.CultureInfo.InvariantCulture);
        string outputFile = $"sitemap_results_{timestamp}.csv";

        // Check all URLs and write the combined CSV.
        await CheckUrlsAsync(allUrls, outputFile);

        Console.WriteLine($"\nCombined results written to: {outputFile}");
    }

    /// <summary>
    /// Downloads a sitemap and returns its XML text, transparently decompressing gzip payloads.
    /// </summary>
    /// <param name="sitemapUrl">URL of the sitemap to fetch.</param>
    /// <returns>The sitemap text, or <c>null</c> when the download or decoding failed
    /// (the failure is reported on standard error).</returns>
    static async Task<string> DownloadSitemapAsync(string sitemapUrl)
    {
        byte[] data;
        try
        {
            data = await SharedHttpClient.GetByteArrayAsync(sitemapUrl);
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"Failed to fetch sitemap {sitemapUrl}: {ex.Message}");
            return null;
        }

        try
        {
            return DecodeSitemapBytes(data);
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"Failed to decompress {sitemapUrl}: {ex.Message}");
            return null;
        }
    }

    /// <summary>
    /// Converts a downloaded sitemap payload to text. Gzip content is recognised by its
    /// magic bytes (0x1F 0x8B) rather than by the URL suffix, and a leading byte-order
    /// mark is removed so the result can be handed directly to an XML parser.
    /// </summary>
    /// <param name="data">Raw bytes of the HTTP response body.</param>
    /// <returns>The decoded text (UTF-8 unless a byte-order mark says otherwise).</returns>
    internal static string DecodeSitemapBytes(byte[] data)
    {
        bool isGzip = data.Length >= 2 && data[0] == 0x1F && data[1] == 0x8B;

        using var raw = new MemoryStream(data);
        using Stream content = isGzip ? new GZipStream(raw, CompressionMode.Decompress) : (Stream)raw;
        // StreamReader detects and strips a BOM; Encoding.GetString would keep it and break XML parsing.
        using var reader = new StreamReader(content);
        return reader.ReadToEnd();
    }

    /// <summary>
    /// Reads a sitemap and adds the URLs it lists to <paramref name="collectedUrls"/>.
    /// Sitemap index files are followed recursively; each URL is attributed to the
    /// sitemap file that directly lists it, and only the first occurrence is kept.
    /// </summary>
    /// <param name="sitemapUrl">URL of the sitemap or sitemap index to read.</param>
    /// <param name="collectedUrls">Accumulator mapping page URL to source sitemap URL.</param>
    /// <param name="sourceSitemap">Sitemap URL recorded as the source for URLs found here.</param>
    /// <param name="visitedSitemaps">Sitemaps already processed in this traversal; used to stop
    /// indexes that reference each other from recursing forever. Callers normally omit it.</param>
    static async Task GetUrlsFromSitemapAsync(
        string sitemapUrl,
        Dictionary<string, string> collectedUrls,
        string sourceSitemap,
        HashSet<string> visitedSitemaps = null)
    {
        visitedSitemaps ??= new HashSet<string>(StringComparer.Ordinal);
        if (!visitedSitemaps.Add(sitemapUrl))
        {
            return; // Already handled in this traversal: break the cycle.
        }

        string xmlContent = await DownloadSitemapAsync(sitemapUrl);
        if (xmlContent == null)
        {
            return;
        }

        XDocument doc;
        try
        {
            doc = XDocument.Parse(xmlContent);
        }
        catch (Exception ex)
        {
            Console.Error.WriteLine($"Failed to parse XML from {sitemapUrl}: {ex.Message}");
            return;
        }

        string rootName = doc.Root?.Name.LocalName;
        XName locName = SitemapNamespace + "loc";

        if (rootName == "sitemapindex")
        {
            foreach (XElement loc in doc.Descendants(locName))
            {
                string nestedSitemap = loc.Value.Trim();
                if (nestedSitemap.Length == 0)
                {
                    continue;
                }

                // Nested sitemaps become the recorded source of the URLs they contain.
                await GetUrlsFromSitemapAsync(nestedSitemap, collectedUrls, nestedSitemap, visitedSitemaps);
            }
        }
        else if (rootName == "urlset")
        {
            foreach (XElement loc in doc.Descendants(locName))
            {
                string url = loc.Value.Trim();
                if (url.Length == 0)
                {
                    continue;
                }

                // First sitemap to list a URL wins; later duplicates are ignored.
                collectedUrls.TryAdd(url, sourceSitemap);
            }
        }
    }

    /// <summary>
    /// Checks every URL with bounded concurrency, echoes each result to the console as it
    /// completes, and writes all results (in input order) to a CSV file.
    /// </summary>
    /// <param name="urlsWithSources">Page URL mapped to the sitemap that listed it.</param>
    /// <param name="outputFile">Path of the CSV file to create or overwrite.</param>
    static async Task CheckUrlsAsync(Dictionary<string, string> urlsWithSources, string outputFile)
    {
        using var throttle = new SemaphoreSlim(MaxConcurrentChecks);
        var pending = new List<Task<string>>(urlsWithSources.Count);

        foreach (KeyValuePair<string, string> entry in urlsWithSources)
        {
            // Wait for a free slot before starting the next request.
            await throttle.WaitAsync();
            pending.Add(CheckOneAsync(entry.Key, entry.Value));
        }

        string[] rows = await Task.WhenAll(pending);

        var lines = new List<string>(rows.Length + 1) { CsvHeader };
        lines.AddRange(rows);
        await File.WriteAllLinesAsync(outputFile, lines);

        // Probes one URL and returns its CSV row; never throws so one bad URL cannot abort the run.
        async Task<string> CheckOneAsync(string url, string sourceSitemap)
        {
            string line;
            try
            {
                var (statusCode, xCache, methodUsed) = await HeadThenGetAsync(SharedHttpClient, url);
                line = ToCsvRow(url, statusCode, xCache, methodUsed, sourceSitemap);
            }
            catch (Exception ex)
            {
                line = ToCsvRow(url, $"ERROR: {ex.Message}", "", "", sourceSitemap);
            }
            finally
            {
                throttle.Release();
            }

            Console.WriteLine(line);
            return line;
        }
    }

    /// <summary>
    /// Probes a URL with HEAD and, when HEAD fails or returns a non-success status,
    /// retries once with GET (reading only the response headers).
    /// </summary>
    /// <param name="httpClient">Client used to send the requests.</param>
    /// <param name="url">URL to probe.</param>
    /// <returns>The numeric status code, the X-Cache header values joined with ';'
    /// (empty when absent), and the HTTP method that produced the result.</returns>
    /// <exception cref="HttpRequestException">The GET fallback failed at the transport level.</exception>
    static async Task<(string statusCode, string xCache, string methodUsed)> HeadThenGetAsync(HttpClient httpClient, string url)
    {
        try
        {
            using var headRequest = new HttpRequestMessage(HttpMethod.Head, url);
            using HttpResponseMessage headResponse = await httpClient.SendAsync(headRequest);
            if (headResponse.IsSuccessStatusCode)
            {
                return DescribeResponse(headResponse, "HEAD");
            }
        }
        catch (Exception ex) when (ex is HttpRequestException || ex is TaskCanceledException)
        {
            // Some servers reject or mishandle HEAD; fall through to the GET attempt below.
        }

        using var getRequest = new HttpRequestMessage(HttpMethod.Get, url);
        // Only the headers are needed, so do not buffer the response body.
        using HttpResponseMessage getResponse =
            await httpClient.SendAsync(getRequest, HttpCompletionOption.ResponseHeadersRead);
        return DescribeResponse(getResponse, "GET");
    }

    /// <summary>Extracts the status code and X-Cache header from a response.</summary>
    private static (string statusCode, string xCache, string methodUsed) DescribeResponse(
        HttpResponseMessage response,
        string methodUsed)
    {
        string xCache = response.Headers.TryGetValues("X-Cache", out IEnumerable<string> values)
            ? string.Join(";", values)
            : "";
        string statusCode = ((int)response.StatusCode).ToString(System.Globalization.CultureInfo.InvariantCulture);
        return (statusCode, xCache, methodUsed);
    }

    /// <summary>Joins the given fields into one CSV row, escaping each field as needed.</summary>
    private static string ToCsvRow(params string[] fields) =>
        string.Join(",", fields.Select(EscapeCsvField));

    /// <summary>
    /// Escapes a single CSV field: values containing a comma, double quote, or line break
    /// are wrapped in double quotes with embedded quotes doubled; other values are returned as-is.
    /// </summary>
    /// <param name="field">Raw field value; <c>null</c> is treated as empty.</param>
    /// <returns>The field text, safe to place between commas in a CSV row.</returns>
    internal static string EscapeCsvField(string field)
    {
        if (string.IsNullOrEmpty(field))
        {
            return "";
        }

        if (field.IndexOfAny(CsvSpecialChars) < 0)
        {
            return field;
        }

        return "\"" + field.Replace("\"", "\"\"") + "\"";
    }
}