C#網(wǎng)絡(luò)爬蟲代碼分享 C#簡(jiǎn)單的爬取工具
公司編輯妹子需要爬取網(wǎng)頁(yè)內(nèi)容,叫我?guī)兔ψ隽艘缓?jiǎn)單的爬取工具
這是爬取網(wǎng)頁(yè)內(nèi)容,像是這對(duì)大家來說都是不難得,但是在這里有一些小改動(dòng),代碼獻(xiàn)上,大家參考
private string GetHttpWebRequest(string url)
{
HttpWebResponse result;
string strHTML = string.Empty;
try
{
Uri uri = new Uri(url);
WebRequest webReq = WebRequest.Create(uri);
WebResponse webRes = webReq.GetResponse();
HttpWebRequest myReq = (HttpWebRequest)webReq;
myReq.UserAgent = "User-Agent:Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705";
myReq.Accept = "*/*";
myReq.KeepAlive = true;
myReq.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
result = (HttpWebResponse)myReq.GetResponse();
Stream receviceStream = result.GetResponseStream();
StreamReader readerOfStream = new StreamReader(receviceStream, System.Text.Encoding.GetEncoding("utf-8"));
strHTML = readerOfStream.ReadToEnd();
readerOfStream.Close();
receviceStream.Close();
result.Close();
}
catch
{
Uri uri = new Uri(url);
WebRequest webReq = WebRequest.Create(uri);
HttpWebRequest myReq = (HttpWebRequest)webReq;
myReq.UserAgent = "User-Agent:Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.2; .NET CLR 1.0.3705";
myReq.Accept = "*/*";
myReq.KeepAlive = true;
myReq.Headers.Add("Accept-Language", "zh-cn,en-us;q=0.5");
//result = (HttpWebResponse)myReq.GetResponse();
try
{
result = (HttpWebResponse)myReq.GetResponse();
}
catch (WebException ex)
{
result = (HttpWebResponse)ex.Response;
}
Stream receviceStream = result.GetResponseStream();
StreamReader readerOfStream = new StreamReader(receviceStream, System.Text.Encoding.GetEncoding("gb2312"));
strHTML = readerOfStream.ReadToEnd();
readerOfStream.Close();
receviceStream.Close();
result.Close();
}
return strHTML;
}
這是根據(jù)url爬取網(wǎng)頁(yè)遠(yuǎn)嗎,有一些小改動(dòng),很多網(wǎng)頁(yè)有不同的編碼格式,甚至有些網(wǎng)站做了反爬取的防范,這個(gè)方法經(jīng)過能夠改動(dòng)也能爬去
以下是爬取網(wǎng)頁(yè)所有的網(wǎng)址鏈接
/// <summary>
/// 提取HTML代碼中的網(wǎng)址
/// </summary>
/// <param name="htmlCode"></param>
/// <returns></returns>
private static List<string> GetHyperLinks(string htmlCode, string url)
{
ArrayList al = new ArrayList();
bool IsGenxin = false;
StringBuilder weburlSB = new StringBuilder();//SQL
StringBuilder linkSb = new StringBuilder();//展示數(shù)據(jù)
List<string> Weburllistzx = new List<string>();//新增
List<string> Weburllist = new List<string>();//舊的
string ProductionContent = htmlCode;
Regex reg = new Regex(@"http(s)?://([\w-]+\.)+[\w-]+/?");
string wangzhanyuming = reg.Match(url, 0).Value;
MatchCollection mc = Regex.Matches(ProductionContent.Replace("href=\"/", "href=\"" + wangzhanyuming).Replace("href='/", "href='" + wangzhanyuming).Replace("href=/", "href=" + wangzhanyuming).Replace("href=\"./", "href=\"" + wangzhanyuming), @"<[aA][^>]* href=[^>]*>", RegexOptions.Singleline);
int Index = 1;
foreach (Match m in mc)
{
MatchCollection mc1 = Regex.Matches(m.Value, @"[a-zA-z]+://[^\s]*", RegexOptions.Singleline);
if (mc1.Count > 0)
{
foreach (Match m1 in mc1)
{
string linkurlstr = string.Empty;
linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
weburlSB.Append("$-$");
weburlSB.Append(linkurlstr);
weburlSB.Append("$_$");
if (!Weburllist.Contains(linkurlstr) && !Weburllistzx.Contains(linkurlstr))
{
IsGenxin = true;
Weburllistzx.Add(linkurlstr);
linkSb.AppendFormat("{0}<br/>", linkurlstr);
}
}
}
else
{
if (m.Value.IndexOf("javascript") == -1)
{
string amstr = string.Empty;
string wangzhanxiangduilujin = string.Empty;
wangzhanxiangduilujin = url.Substring(0, url.LastIndexOf("/") + 1);
amstr = m.Value.Replace("href=\"", "href=\"" + wangzhanxiangduilujin).Replace("href='", "href='" + wangzhanxiangduilujin);
MatchCollection mc11 = Regex.Matches(amstr, @"[a-zA-z]+://[^\s]*", RegexOptions.Singleline);
foreach (Match m1 in mc11)
{
string linkurlstr = string.Empty;
linkurlstr = m1.Value.Replace("\"", "").Replace("'", "").Replace(">", "").Replace(";", "");
weburlSB.Append("$-$");
weburlSB.Append(linkurlstr);
weburlSB.Append("$_$");
if (!Weburllist.Contains(linkurlstr) && !Weburllistzx.Contains(linkurlstr))
{
IsGenxin = true;
Weburllistzx.Add(linkurlstr);
linkSb.AppendFormat("{0}<br/>", linkurlstr);
}
}
}
}
Index++;
}
return Weburllistzx;
}
這塊的技術(shù)其實(shí)就是簡(jiǎn)單的使用了正則去匹配!接下來獻(xiàn)上獲取標(biāo)題,以及存儲(chǔ)到xml文件的方法
/// <summary>
/// // 把網(wǎng)址寫入xml文件
/// </summary>
/// <param name="strURL"></param>
/// <param name="alHyperLinks"></param>
private static void WriteToXml(string strURL, List<string> alHyperLinks)
{
XmlTextWriter writer = new XmlTextWriter(@"D:\HyperLinks.xml", Encoding.UTF8);
writer.Formatting = Formatting.Indented;
writer.WriteStartDocument(false);
writer.WriteDocType("HyperLinks", null, "urls.dtd", null);
writer.WriteComment("提取自" + strURL + "的超鏈接");
writer.WriteStartElement("HyperLinks");
writer.WriteStartElement("HyperLinks", null);
writer.WriteAttributeString("DateTime", DateTime.Now.ToString());
foreach (string str in alHyperLinks)
{
string title = GetDomain(str);
string body = str;
writer.WriteElementString(title, null, body);
}
writer.WriteEndElement();
writer.WriteEndElement();
writer.Flush();
writer.Close();
}
/// <summary>
/// 獲取網(wǎng)址的域名后綴
/// </summary>
/// <param name="strURL"></param>
/// <returns></returns>
private static string GetDomain(string strURL)
{
string retVal;
string strRegex = @"(\.com/|\.net/|\.cn/|\.org/|\.gov/)";
Regex r = new Regex(strRegex, RegexOptions.IgnoreCase);
Match m = r.Match(strURL);
retVal = m.ToString();
strRegex = @"\.|/$";
retVal = Regex.Replace(retVal, strRegex, "").ToString();
if (retVal == "")
retVal = "other";
return retVal;
}
/// <summary>
/// 獲取標(biāo)題
/// </summary>
/// <param name="html"></param>
/// <returns></returns>
private static string GetTitle(string html)
{
string titleFilter = @"<title>[\s\S]*?</title>";
string h1Filter = @"<h1.*?>.*?</h1>";
string clearFilter = @"<.*?>";
string title = "";
Match match = Regex.Match(html, titleFilter, RegexOptions.IgnoreCase);
if (match.Success)
{
title = Regex.Replace(match.Groups[0].Value, clearFilter, "");
}
// 正文的標(biāo)題一般在h1中,比title中的標(biāo)題更干凈
match = Regex.Match(html, h1Filter, RegexOptions.IgnoreCase);
if (match.Success)
{
string h1 = Regex.Replace(match.Groups[0].Value, clearFilter, "");
if (!String.IsNullOrEmpty(h1) && title.StartsWith(h1))
{
title = h1;
}
}
return title;
}
這就是所用的全部方法,還是有很多需要改進(jìn)之處!大家如果有發(fā)現(xiàn)不足之處還請(qǐng)指出,謝謝!
以上就是本文的全部?jī)?nèi)容,希望對(duì)大家的學(xué)習(xí)有所幫助,也希望大家多多支持我們。
上一篇:C#向線程中傳遞多個(gè)參數(shù)的解決方法(兩種)
欄 目:C#教程
下一篇:深入理解C#中的Delegate
本文標(biāo)題:C#網(wǎng)絡(luò)爬蟲代碼分享 C#簡(jiǎn)單的爬取工具
本文地址:http://www.jygsgssxh.com/a1/C_jiaocheng/6332.html
您可能感興趣的文章
- 01-10C#一個(gè)簡(jiǎn)單的定時(shí)小程序?qū)崿F(xiàn)代碼
- 01-10C#獲取網(wǎng)頁(yè)源代碼的方法
- 01-1010個(gè)C#程序員經(jīng)常用到的實(shí)用代碼片段
- 01-10C#導(dǎo)出網(wǎng)站功能實(shí)例代碼講解
- 01-10C#代碼實(shí)現(xiàn)PDF文檔操作類
- 01-10C#超實(shí)用代碼段合集
- 01-10分享WCF聊天程序--WCFChat實(shí)現(xiàn)代碼
- 01-10Silverlight將圖片轉(zhuǎn)換為byte的實(shí)現(xiàn)代碼
- 01-10使用C#代碼獲取存儲(chǔ)過程返回值
- 01-10C#代碼實(shí)現(xiàn)對(duì)AES加密解密


閱讀排行
- 1C語(yǔ)言 while語(yǔ)句的用法詳解
- 2java 實(shí)現(xiàn)簡(jiǎn)單圣誕樹的示例代碼(圣誕
- 3利用C語(yǔ)言實(shí)現(xiàn)“百馬百擔(dān)”問題方法
- 4C語(yǔ)言中計(jì)算正弦的相關(guān)函數(shù)總結(jié)
- 5c語(yǔ)言計(jì)算三角形面積代碼
- 6什么是 WSH(腳本宿主)的詳細(xì)解釋
- 7C++ 中隨機(jī)函數(shù)random函數(shù)的使用方法
- 8正則表達(dá)式匹配各種特殊字符
- 9C語(yǔ)言十進(jìn)制轉(zhuǎn)二進(jìn)制代碼實(shí)例
- 10C語(yǔ)言查找數(shù)組里數(shù)字重復(fù)次數(shù)的方法
本欄相關(guān)
- 01-10C#通過反射獲取當(dāng)前工程中所有窗體并
- 01-10關(guān)于ASP網(wǎng)頁(yè)無法打開的解決方案
- 01-10WinForm限制窗體不能移到屏幕外的方法
- 01-10WinForm繪制圓角的方法
- 01-10C#實(shí)現(xiàn)txt定位指定行完整實(shí)例
- 01-10WinForm實(shí)現(xiàn)仿視頻播放器左下角滾動(dòng)新
- 01-10C#停止線程的方法
- 01-10C#實(shí)現(xiàn)清空回收站的方法
- 01-10C#通過重寫Panel改變邊框顏色與寬度的
- 01-10C#實(shí)現(xiàn)讀取注冊(cè)表監(jiān)控當(dāng)前操作系統(tǒng)已
隨機(jī)閱讀
- 01-11ajax實(shí)現(xiàn)頁(yè)面的局部加載
- 01-10SublimeText編譯C開發(fā)環(huán)境設(shè)置
- 01-11Mac OSX 打開原生自帶讀寫NTFS功能(圖文
- 08-05dedecms(織夢(mèng))副欄目數(shù)量限制代碼修改
- 01-10使用C語(yǔ)言求解撲克牌的順子及n個(gè)骰子
- 01-10C#中split用法實(shí)例總結(jié)
- 04-02jquery與jsp,用jquery
- 08-05織夢(mèng)dedecms什么時(shí)候用欄目交叉功能?
- 08-05DEDE織夢(mèng)data目錄下的sessions文件夾有什
- 01-10delphi制作wav文件的方法


