C#通过正则表达式抓取网页信息的类详解编程语言

using System; 
using System.Data; 
using System.Configuration; 
using System.Net; 
using System.IO; 
using System.Text; 
using System.Collections.Generic; 
using System.Text.RegularExpressions; 
using System.Threading; 
using System.Web; 
using System.Web.UI.MobileControls; 
/// <summary> 
/// 网页类 
/// </summary> 
public class WebPage 
{ 
#region 私有成员 
private Uri m_uri;   //url 
private List<Link> m_links;    //此网页上的链接 
private string m_title;        //标题 
private string m_html;         //HTML代码 
private string m_outstr;       //网页可输出的纯文本 
private bool m_good;           //网页是否可用 
private int m_pagesize;       //网页的大小 
private static Dictionary<string, CookieContainer> webcookies = new Dictionary<string, CookieContainer>();//存放所有网页的Cookie 
#endregion 
#region 属性 
/// <summary> 
/// 通过此属性可获得本网页的网址，只读 
/// </summary> 
public string URL 
{ 
get 
{ 
return m_uri.AbsoluteUri; 
} 
} 
/// <summary> 
/// 通过此属性可获得本网页的标题，只读 
/// </summary> 
public string Title 
{ 
get 
{ 
if (m_title == "") 
{ 
Regex reg = new Regex(@"(?m)<title[^>]*>(?<title>(?:/w|/W)*?)</title[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase); 
Match mc = reg.Match(m_html); 
if (mc.Success) 
m_title = mc.Groups["title"].Value.Trim(); 
} 
return m_title; 
} 
} 
public string M_html 
{ 
get 
{ 
if (m_html == null) 
{ 
m_html = ""; 
} 
return m_html; 
} 
} 
/// <summary> 
/// 此属性获得本网页的所有链接信息，只读 
/// </summary> 
public List<Link> Links 
{ 
get 
{ 
if (m_links.Count == 0) getLinks(); 
return m_links; 
} 
} 
/// <summary> 
/// 此属性返回本网页的全部纯文本信息，只读 
/// </summary> 
public string Context 
{ 
get 
{ 
if (m_outstr == "") getContext(Int16.MaxValue); 
return m_outstr; 
} 
} 
/// <summary> 
/// 此属性获得本网页的大小 
/// </summary> 
public int PageSize 
{ 
get 
{ 
return m_pagesize; 
} 
} 
/// <summary> 
/// 此属性获得本网页的所有站内链接 
/// </summary> 
public List<Link> InsiteLinks 
{ 
get 
{ 
return getSpecialLinksByUrl("^http://" + m_uri.Host, Int16.MaxValue); 
} 
} 
/// <summary> 
/// 此属性表示本网页是否可用 
/// </summary> 
public bool IsGood 
{ 
get 
{ 
return m_good; 
} 
} 
/// <summary> 
/// 此属性表示网页的所在的网站 
/// </summary> 
public string Host 
{ 
get 
{ 
return m_uri.Host; 
} 
} 
#endregion 
/// <summary> 
/// 从HTML代码中分析出链接信息 
/// </summary> 
/// <returns>List<Link></returns> 
private List<Link> getLinks() 
{ 
if (m_links.Count == 0) 
{ 
Regex[] regex = new Regex[2]; 
regex[0] = new Regex(@"<a/shref/s*=""(?<URL>[^""]*).*?>(?<title>[^<]*)</a>", RegexOptions.IgnoreCase | RegexOptions.Singleline); 
regex[1] = new Regex("<[i]*frame[^><]+src=(/"|')?(?<url>([^>/"'//s)])+)(/"|')?[^>]*>", RegexOptions.IgnoreCase); 
for (int i = 0; i < 2; i++) 
{ 
Match match = regex[i].Match(m_html); 
while (match.Success) 
{ 
try 
{ 
string url = HttpUtility.UrlDecode(new Uri(m_uri, match.Groups["URL"].Value).AbsoluteUri); 
string text = ""; 
if (i == 0) text = new Regex("(<[^>]+>)|(//s)|(&nbsp;)|&|/"", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(match.Groups["text"].Value, ""); 
Link link = new Link(); 
link.Text = text; 
link.NavigateUrl = url; 
m_links.Add(link); 
} 
catch (Exception ex) { Console.WriteLine(ex.Message); }; 
match = match.NextMatch(); 
} 
} 
} 
return m_links; 
} 
/// <summary> 
/// 此私有方法从一段HTML文本中提取出一定字数的纯文本 
/// </summary> 
/// <param name="instr">HTML代码</param> 
/// <param name="firstN">提取从头数多少个字</param> 
/// <param name="withLink">是否要链接里面的字</param> 
/// <returns>纯文本</returns> 
private string getFirstNchar(string instr, int firstN, bool withLink) 
{ 
if (m_outstr == "") 
{ 
m_outstr = instr.Clone() as string; 
m_outstr = new Regex(@"(?m)<script[^>]*>(/w|/W)*?</script[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, ""); 
m_outstr = new Regex(@"(?m)<style[^>]*>(/w|/W)*?</style[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, ""); 
m_outstr = new Regex(@"(?m)<select[^>]*>(/w|/W)*?</select[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, ""); 
if (!withLink) m_outstr = new Regex(@"(?m)<a[^>]*>(/w|/W)*?</a[^>]*>", RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(m_outstr, ""); 
Regex objReg = new System.Text.RegularExpressions.Regex("(<[^>]+?>)|&nbsp;", RegexOptions.Multiline | RegexOptions.IgnoreCase); 
m_outstr = objReg.Replace(m_outstr, ""); 
Regex objReg2 = new System.Text.RegularExpressions.Regex("(//s)+", RegexOptions.Multiline | RegexOptions.IgnoreCase); 
m_outstr = objReg2.Replace(m_outstr, " "); 
} 
return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr; 
} 
#region 公有文法 
/// <summary> 
/// 此公有方法提取网页中一定字数的纯文本，包括链接文字 
/// </summary> 
/// <param name="firstN">字数</param> 
/// <returns></returns> 
public string getContext(int firstN) 
{ 
return getFirstNchar(m_html, firstN, true); 
} 
/// <summary> 
/// 此公有方法从本网页的链接中提取一定数量的链接，该链接的URL满足某正则式 
/// </summary> 
/// <param name="pattern">正则式</param> 
/// <param name="count">返回的链接的个数</param> 
/// <returns>List<Link></returns> 
public List<Link> getSpecialLinksByUrl(string pattern, int count) 
{ 
if (m_links.Count == 0) getLinks(); 
List<Link> SpecialLinks = new List<Link>(); 
List<Link>.Enumerator i; 
i = m_links.GetEnumerator(); 
int cnt = 0; 
while (i.MoveNext() && cnt < count) 
{ 
if (new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase).Match(i.Current.NavigateUrl).Success) 
{ 
SpecialLinks.Add(i.Current); 
cnt++; 
} 
} 
return SpecialLinks; 
} 
/// <summary> 
/// 此公有方法从本网页的链接中提取一定数量的链接，该链接的文字满足某正则式 
/// </summary> 
/// <param name="pattern">正则式</param> 
/// <param name="count">返回的链接的个数</param> 
/// <returns>List<Link></returns> 
public List<Link> getSpecialLinksByText(string pattern, int count) 
{ 
if (m_links.Count == 0) getLinks(); 
List<Link> SpecialLinks = new List<Link>(); 
List<Link>.Enumerator i; 
i = m_links.GetEnumerator(); 
int cnt = 0; 
while (i.MoveNext() && cnt < count) 
{ 
if (new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase).Match(i.Current.Text).Success) 
{ 
SpecialLinks.Add(i.Current); 
cnt++; 
} 
} 
return SpecialLinks; 
} 
/// <summary> 
/// 这公有方法提取本网页的纯文本中满足某正则式的文字 
/// </summary> 
/// <param name="pattern">正则式</param> 
/// <returns>返回文字</returns> 
public string getSpecialWords(string pattern) 
{ 
if (m_outstr == "") getContext(Int16.MaxValue); 
Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase); 
Match mc = regex.Match(m_outstr); 
if (mc.Success) 
return mc.Groups[1].Value; 
return string.Empty; 
} 
#endregion 
#region 构造函数 
private void Init(string _url) 
{ 
try 
{ 
m_uri = new Uri(_url); 
m_links = new List<Link>(); 
m_html = ""; 
m_outstr = ""; 
m_title = ""; 
m_good = true; 
if (_url.EndsWith(".rar") || _url.EndsWith(".dat") || _url.EndsWith(".msi")) 
{ 
m_good = false; 
return; 
} 
HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri); 
rqst.AllowAutoRedirect = true; 
rqst.MaximumAutomaticRedirections = 3; 
rqst.UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)"; 
rqst.KeepAlive = true; 
rqst.Timeout = 10000; 
lock (WebPage.webcookies) 
{ 
if (WebPage.webcookies.ContainsKey(m_uri.Host)) 
rqst.CookieContainer = WebPage.webcookies[m_uri.Host]; 
else 
{ 
CookieContainer cc = new CookieContainer(); 
WebPage.webcookies[m_uri.Host] = cc; 
rqst.CookieContainer = cc; 
} 
} 
HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse(); 
Stream sm = rsps.GetResponseStream(); 
if (!rsps.ContentType.ToLower().StartsWith("text/") || rsps.ContentLength > 1 << 22) 
{ 
rsps.Close(); 
m_good = false; 
return; 
} 
Encoding cding = System.Text.Encoding.Default; 
string contenttype = rsps.ContentType.ToLower(); 
int ix = contenttype.IndexOf("charset="); 
if (ix != -1) 
{ 
try 
{ 
cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + "charset".Length + 1)); 
} 
catch 
{ 
cding = Encoding.Default; 
} 
//该处视情况而定 有的需要解码 
//m_html = HttpUtility.HtmlDecode(new StreamReader(sm, cding).ReadToEnd()); 
m_html = new StreamReader(sm, cding).ReadToEnd(); 
} 
else 
{ 
//该处视情况而定 有的需要解码 
//m_html = HttpUtility.HtmlDecode(new StreamReader(sm, cding).ReadToEnd()); 
m_html = new StreamReader(sm, cding).ReadToEnd(); 
Regex regex = new Regex("charset=(?<cding>[^=]+)?/"", RegexOptions.IgnoreCase); 
string strcding = regex.Match(m_html).Groups["cding"].Value; 
try 
{ 
cding = Encoding.GetEncoding(strcding); 
} 
catch 
{ 
cding = Encoding.Default; 
} 
byte[] bytes = Encoding.Default.GetBytes(m_html.ToCharArray()); 
m_html = cding.GetString(bytes); 
if (m_html.Split('?').Length > 100) 
{ 
m_html = Encoding.Default.GetString(bytes); 
} 
} 
m_pagesize = m_html.Length; 
m_uri = rsps.ResponseUri; 
rsps.Close(); 
} 
catch (Exception ex) 
{ 
} 
} 
public WebPage(string _url) 
{ 
string uurl = ""; 
try 
{ 
uurl = Uri.UnescapeDataString(_url); 
_url = uurl; 
} 
catch { }; 
Init(_url); 
} 
#endregion 
}

调用演示

WebPage webInfo = new WebPage("网址"); 
webInfo.Context;//不包含html标签的所有内容 
webInfo.M_html;//包含html标签的内容

原创文章，作者：ItWorker，如若转载，请注明出处：https://blog.ytso.com/11126.html

C#通过正则表达式抓取网页信息的类详解编程语言

相关推荐

发表回复