8/18/2007

ASP.NET: REGEX Parse the RSS / ATOM Feed Url from a Page

2 is not equal to 3, not even for large values of 2. - Grabel's Law

I've been scraping again, I confess. Just can't resist it. One of the things I've run into when grabbing a bunch of web pages in a threadpool callback is how to determine whether a page sports the feed autodiscovery tags (i.e., whether it declares an RSS or Atom feed for the site in a <link> element).

Here is one way to do this with a little bit of REGEX:

using System;
using System.Text.RegularExpressions;

namespace WebLogsSearcher
{
    /// <summary>
    /// Finds the RSS / Atom feed autodiscovery URL declared in an HTML page.
    /// </summary>
    public static class Matcher
    {
        // Both patterns are built once and reused; Regex instances are safe to
        // share between threads for matching.

        /// <summary>
        /// Matches one complete &lt;link ...&gt; tag. Quoted attribute values are
        /// consumed as a unit so a '&gt;' inside a quoted value does not end the tag.
        /// Each alternative consumes a distinct leading character, which keeps
        /// matching linear even on malformed markup.
        /// </summary>
        private static readonly Regex LinkTagRegex =
            new Regex(@"<link\b(?:[^>""']|""[^""]*""|'[^']*')*>",
                      RegexOptions.IgnoreCase |
                      RegexOptions.ExplicitCapture |
                      RegexOptions.CultureInvariant);

        /// <summary>
        /// Matches a single name=value attribute. The value may be double-quoted,
        /// single-quoted or unquoted; whichever form matched is exposed through
        /// the "value" group without its surrounding quotes.
        /// </summary>
        private static readonly Regex AttributeRegex =
            new Regex(@"(?<name>[\w-]+)\s*=\s*" +
                      @"(?:""(?<value>[^""]*)""|'(?<value>[^']*)'|(?<value>[^""'<>\s]+))",
                      RegexOptions.IgnoreCase |
                      RegexOptions.ExplicitCapture |
                      RegexOptions.CultureInvariant);

        /// <summary>
        /// Scans <paramref name="htmldata"/> for &lt;link&gt; tags whose <c>type</c>
        /// attribute is <c>application/rss+xml</c> or <c>application/atom+xml</c>
        /// and returns the <c>href</c> of such a tag.
        /// </summary>
        /// <param name="htmldata">The HTML source of the page; may be null or empty.</param>
        /// <returns>
        /// The <c>href</c> value of the last feed &lt;link&gt; tag in the page, exactly
        /// as written in the markup (it may be a relative URL), or an empty string
        /// when the input is null/empty or no feed link is found.
        /// </returns>
        public static string Parse(string htmldata)
        {
            if (string.IsNullOrEmpty(htmldata))
            {
                return "";
            }

            string url = "";
            foreach (Match linkMatch in LinkTagRegex.Matches(htmldata))
            {
                bool isFeed = false;
                string href = null;

                // Collect both attributes before deciding, so the result does not
                // depend on whether "type" appears before or after "href".
                foreach (Match attribute in AttributeRegex.Matches(linkMatch.Value))
                {
                    string name = attribute.Groups["name"].Value;
                    string value = attribute.Groups["value"].Value;

                    if (string.Equals(name, "type", StringComparison.OrdinalIgnoreCase))
                    {
                        if (IsFeedMimeType(value))
                        {
                            isFeed = true;
                        }
                    }
                    else if (string.Equals(name, "href", StringComparison.OrdinalIgnoreCase))
                    {
                        href = value;
                    }
                }

                // A later feed link overrides an earlier one (last match wins).
                if (isFeed && href != null)
                {
                    url = href;
                }
            }

            return url;
        }

        /// <summary>
        /// Returns true when <paramref name="mimeType"/> names an RSS or Atom feed.
        /// The comparison ignores case and surrounding whitespace.
        /// </summary>
        private static bool IsFeedMimeType(string mimeType)
        {
            string trimmed = mimeType.Trim();
            return string.Equals(trimmed, "application/atom+xml", StringComparison.OrdinalIgnoreCase)
                || string.Equals(trimmed, "application/rss+xml", StringComparison.OrdinalIgnoreCase);
        }
    }
}