基于C#實現網頁爬蟲
本文實例為大家分享了基于C#實現網頁爬蟲的詳細代碼,供大家參考,具體內容如下
HTTP請求工具類:
功能:
1、獲取網頁html
2、下載網絡圖片
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using System.Windows.Forms;
namespace Utils
{
/// <summary>
/// HTTP helper used by the crawler: fetches a page's HTML and downloads
/// remote files into a local "download" folder.
/// </summary>
public class HttpRequestUtil
{
    // User-Agent header sent with every request.
    private const string UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)";

    /// <summary>
    /// Requests <paramref name="url"/> with HTTP GET and returns the response body decoded as UTF-8.
    /// </summary>
    /// <param name="url">Absolute http/https URL of the page.</param>
    /// <returns>The response body as text.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    /// <exception cref="ArgumentException"><paramref name="url"/> is not an http/https URL.</exception>
    /// <exception cref="WebException">The request failed.</exception>
    public static string GetPageHtml(string url)
    {
        HttpWebRequest request = CreateRequest(url);

        // GetResponse() is the point at which the GET request is actually sent.
        // The response must be closed, otherwise its connection is never
        // returned to the pool and later requests to the same host can stall.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(responseStream, Encoding.UTF8))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>
    /// Downloads <paramref name="url"/> into the "download" folder under
    /// <c>Application.StartupPath</c>. The local file name is the last path segment
    /// of the URL; if a file with that name already exists nothing is downloaded.
    /// </summary>
    /// <param name="url">Absolute http/https URL of the file.</param>
    /// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
    /// <exception cref="ArgumentException">
    /// <paramref name="url"/> is not an http/https URL or has no file name.
    /// </exception>
    /// <exception cref="WebException">The request failed.</exception>
    /// <exception cref="IOException">The local file could not be written.</exception>
    public static void HttpDownloadFile(string url)
    {
        if (url == null)
        {
            throw new ArgumentNullException("url");
        }

        string fileName = GetFileNameFromUrl(url);
        if (fileName.Length == 0)
        {
            throw new ArgumentException("The URL does not contain a file name.", "url");
        }

        string directory = Path.Combine(Application.StartupPath, "download");
        // CreateDirectory does nothing when the directory already exists.
        Directory.CreateDirectory(directory);

        string filePathName = Path.Combine(directory, fileName);
        if (File.Exists(filePathName))
        {
            return;
        }

        HttpWebRequest request = CreateRequest(url);
        request.Proxy = null;

        bool fileOpened = false;
        bool completed = false;
        try
        {
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream responseStream = response.GetResponseStream())
            using (FileStream fileStream = new FileStream(filePathName, FileMode.Create))
            {
                fileOpened = true;
                responseStream.CopyTo(fileStream);
            }
            completed = true;
        }
        finally
        {
            // A truncated file would be mistaken for a finished download by the
            // File.Exists check above, so remove it when the copy did not finish.
            // Only delete a file this call created itself.
            if (fileOpened && !completed)
            {
                TryDelete(filePathName);
            }
        }
    }

    // Creates a GET request for the URL with the crawler's User-Agent applied.
    private static HttpWebRequest CreateRequest(string url)
    {
        if (url == null)
        {
            throw new ArgumentNullException("url");
        }

        HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
        if (request == null)
        {
            // WebRequest.Create returns other request types for ftp://, file:// etc.
            throw new ArgumentException("Only http and https URLs are supported.", "url");
        }

        request.UserAgent = UserAgent;
        return request;
    }

    // Returns the last path segment of the URL (query string and fragment removed),
    // with characters that are not allowed in file names replaced by '_'.
    private static string GetFileNameFromUrl(string url)
    {
        int cut = url.IndexOfAny(new char[] { '?', '#' });
        string pathPart = cut >= 0 ? url.Substring(0, cut) : url;
        string name = pathPart.Substring(pathPart.LastIndexOf('/') + 1);

        foreach (char invalid in Path.GetInvalidFileNameChars())
        {
            name = name.Replace(invalid, '_');
        }

        return name;
    }

    // Best-effort removal of a partially written file; failure to delete is ignored
    // so that the original download error is the one the caller sees.
    private static void TryDelete(string path)
    {
        try
        {
            File.Delete(path);
        }
        catch (IOException)
        {
        }
        catch (UnauthorizedAccessException)
        {
        }
    }
}
}
多線程爬取網頁代碼:
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using Utils;
namespace 爬蟲
{
/// <summary>
/// Main window of the crawler. Start (button1) launches worker threads that walk
/// the list pages, open each linked user page and download the images found there.
/// Stop (button2), Pause (button3) and Resume (button4) control the running crawl.
/// Pause and stop are cooperative: a worker reacts at its next checkpoint, i.e.
/// before its next HTTP request, not in the middle of one.
/// </summary>
public partial class Form1 : Form
{
    // Number of worker threads per run and list pages handled by each of them.
    private const int WorkerCount = 10;
    private const int PagesPerWorker = 10;
    private const int TotalPages = WorkerCount * PagesPerWorker;

    // Regex instances are safe to share between threads, so build them once.
    private static readonly Regex UserLinkRegex = new Regex("<a[\\s]+class=\"J-userPic([^<>]*?)[\\s]+href=\"([^\"]*?)\"");
    private static readonly Regex ImageRegex = new Regex("<p class=\"tc mb10\"><img[\\s]+src=\"([^\"]*?)\"");

    // Signaled while workers may run; reset while the crawl is paused.
    private readonly ManualResetEventSlim resumeSignal = new ManualResetEventSlim(true);

    // The run whose progress is shown in the UI. Read and written on the UI thread only.
    private CrawlSession currentSession;

    /// <summary>
    /// State of one crawl run. Every run gets its own counters and cancellation
    /// source so that workers of a stopped run cannot disturb a newer run.
    /// </summary>
    private sealed class CrawlSession
    {
        // Not disposed on purpose: workers may still be registering on the token
        // when the run is cancelled.
        public readonly CancellationTokenSource Cancellation = new CancellationTokenSource();
        public readonly DateTime StartedUtc = DateTime.UtcNow;

        // Updated with Interlocked from several worker threads.
        public int PagesDone;
        public int ProfilesDone;
        public int ImagesDone;
    }

    public Form1()
    {
        InitializeComponent();
    }

    /// <summary>
    /// Start: begins a new crawl with <c>WorkerCount</c> background threads.
    /// </summary>
    private void button1_Click(object sender, EventArgs e)
    {
        button3.Enabled = true;
        button2.Enabled = true;
        button1.Enabled = false;
        // A new run always starts unpaused, so Resume has nothing to do.
        button4.Enabled = false;
        lblPage.Text = "已完成頁數(shù):0";

        CancelCurrentSession();
        resumeSignal.Set();

        CrawlSession session = new CrawlSession();
        currentSession = session;

        for (int i = 1; i <= WorkerCount; i++)
        {
            // Copy the loop variable: the lambda must see this iteration's value.
            int workerNumber = i;
            Thread worker = new Thread(() => CrawlPages(session, workerNumber));
            // Background threads do not keep the process alive after the form closes.
            worker.IsBackground = true;
            worker.Start();
        }
    }

    /// <summary>
    /// Stop: cancels the current crawl and resets the buttons.
    /// </summary>
    private void button2_Click(object sender, EventArgs e)
    {
        CancelCurrentSession();
        button1.Enabled = true;
        button2.Enabled = false;
        button3.Enabled = false;
        button4.Enabled = false;
    }

    /// <summary>
    /// Cancels the running crawl when the window is closed.
    /// </summary>
    private void Form1_FormClosing(object sender, FormClosingEventArgs e)
    {
        CancelCurrentSession();
    }

    /// <summary>
    /// Pause: workers block at their next checkpoint until Resume is pressed.
    /// </summary>
    private void button3_Click(object sender, EventArgs e)
    {
        resumeSignal.Reset();
        button3.Enabled = false;
        button4.Enabled = true;
    }

    /// <summary>
    /// Resume: releases workers that are blocked by Pause.
    /// </summary>
    private void button4_Click(object sender, EventArgs e)
    {
        resumeSignal.Set();
        button3.Enabled = true;
        button4.Enabled = false;
    }

    // Worker thread body: processes list pages (workerNumber - 1) * PagesPerWorker + 1
    // through workerNumber * PagesPerWorker and reports progress after each page.
    private void CrawlPages(CrawlSession session, int workerNumber)
    {
        for (int j = 1; j <= PagesPerWorker; j++)
        {
            // Local to this thread; a variable shared by all workers would be
            // overwritten by the other threads while this one is still using it.
            int pageIndex = (workerNumber - 1) * PagesPerWorker + j;
            try
            {
                CrawlPage(session, pageIndex);
            }
            catch (OperationCanceledException)
            {
                // Stop was requested.
                return;
            }
            catch (Exception ex)
            {
                // Best effort: a failing page is skipped but still counted as finished.
                System.Diagnostics.Trace.WriteLine("Page " + pageIndex.ToString() + " failed: " + ex.Message);
            }

            int finishedPages = Interlocked.Increment(ref session.PagesDone);
            PostToUi(session, delegate()
            {
                lblPage.Text = "已完成頁數(shù):" + ReadCounter(ref session.PagesDone).ToString();
            });

            // Exactly one worker observes the final count.
            if (finishedPages == TotalPages)
            {
                PostToUi(session, delegate()
                {
                    button1.Enabled = true;
                    MessageBox.Show("完成!");
                });
            }
        }
    }

    // Fetches one list page, follows every site-relative user link on it and
    // downloads the images found on the linked pages.
    private void CrawlPage(CrawlSession session, int pageIndex)
    {
        Checkpoint(session);
        string pageHtml = HttpRequestUtil.GetPageHtml("http://tt.mop.com/c44/0/1_" + pageIndex.ToString() + ".html");

        foreach (Match linkMatch in UserLinkRegex.Matches(pageHtml))
        {
            // Group 2 of the pattern is the href value.
            string url = linkMatch.Groups[2].Value;
            if (!url.StartsWith("/", StringComparison.Ordinal))
            {
                continue;
            }

            Checkpoint(session);
            string imgPageHtml = HttpRequestUtil.GetPageHtml("http://tt.mop.com" + url);

            Interlocked.Increment(ref session.ProfilesDone);
            PostToUi(session, delegate()
            {
                lblPerson.Text = "已完成條數(shù):" + ReadCounter(ref session.ProfilesDone).ToString();
            });

            foreach (Match imageMatch in ImageRegex.Matches(imgPageHtml))
            {
                // Group 1 of the pattern is the src value.
                string imgUrl = imageMatch.Groups[1].Value;
                if (!imgUrl.StartsWith("http://i1", StringComparison.Ordinal))
                {
                    continue;
                }

                Checkpoint(session);
                try
                {
                    HttpRequestUtil.HttpDownloadFile(imgUrl);
                    Interlocked.Increment(ref session.ImagesDone);
                    PostToUi(session, delegate() { ShowImageStats(session); });
                }
                catch (Exception ex)
                {
                    // Best effort: one failed image must not abort the rest of the page.
                    System.Diagnostics.Trace.WriteLine("Download of " + imgUrl + " failed: " + ex.Message);
                }
                Thread.Sleep(1);
            }
        }
    }

    // Updates the image counter and the download speed labels. Runs on the UI thread.
    private void ShowImageStats(CrawlSession session)
    {
        int images = ReadCounter(ref session.ImagesDone);
        lblNum.Text = "已下載圖片數(shù)" + images.ToString();

        double seconds = DateTime.UtcNow.Subtract(session.StartedUtc).TotalSeconds;
        if (seconds > 0)
        {
            lblSpeed.Text = "速度:" + (images / seconds).ToString("0.0") + "張/秒";
        }
    }

    // Blocks while the crawl is paused and throws OperationCanceledException
    // once the given run has been stopped.
    private void Checkpoint(CrawlSession session)
    {
        CancellationToken token = session.Cancellation.Token;
        token.ThrowIfCancellationRequested();
        resumeSignal.Wait(token);
    }

    // Reads a counter that other threads modify through Interlocked.
    private static int ReadCounter(ref int counter)
    {
        return Interlocked.CompareExchange(ref counter, 0, 0);
    }

    // Queues a UI update without blocking the worker. The update is dropped when
    // the form is gone or when the run it belongs to is no longer the current one.
    private void PostToUi(CrawlSession session, Action update)
    {
        Action guarded = delegate()
        {
            if (session == currentSession)
            {
                update();
            }
        };

        try
        {
            if (!IsDisposed && IsHandleCreated)
            {
                BeginInvoke(guarded);
            }
        }
        catch (InvalidOperationException)
        {
            // The window handle was destroyed between the check and the call
            // (ObjectDisposedException is covered as well); nothing to update.
        }
    }

    // Signals the workers of the current run to stop and detaches the run from the UI.
    private void CancelCurrentSession()
    {
        CrawlSession session = currentSession;
        currentSession = null;
        if (session != null)
        {
            session.Cancellation.Cancel();
        }
    }
}
}
截圖:
以上就是本文的全部內容,希望對大家的學習有所幫助。
上一篇:C#6.0中10大新特性的應用和總結
欄 目:C#教程
下一篇:C#常見的幾種集合 ArrayList,Hashtable,List<T>,
本文標題:基于C#實現網頁爬蟲
本文地址:http://www.jygsgssxh.com/a1/C_jiaocheng/6618.html
您可能感興趣的文章
- 01-10關于ASP網頁無法打開的解決方案
- 01-10C#實現txt定位指定行完整實例
- 01-10WinForm實現仿視頻播放器左下角滾動新聞效果的方法
- 01-10C#實現清空回收站的方法
- 01-10C#實現讀取注冊表監控當前操作系統已安裝軟件變化的方法
- 01-10C#實現多線程下載文件的方法
- 01-10C#實現Winform中打開網頁頁面的方法
- 01-10C#實現遠程關閉計算機或重啟計算機的方法
- 01-10C#自定義簽名章實現方法
- 01-10C#文件斷點續傳實現方法


閱讀排行
本欄相關(guān)
- 01-10C#通過反射獲取當前工程中所有窗體并
- 01-10關于ASP網頁無法打開的解決方案
- 01-10WinForm限制窗體不能移到屏幕外的方法
- 01-10WinForm繪制圓角的方法
- 01-10C#實現txt定位指定行完整實例
- 01-10WinForm實現仿視頻播放器左下角滾動新
- 01-10C#停止線程的方法
- 01-10C#實現清空回收站的方法
- 01-10C#通過重寫Panel改變邊框顏色與寬度的
- 01-10C#實現讀取注冊表監控當前操作系統已
隨機閱讀
- 01-11Mac OSX 打開原生自帶讀寫NTFS功能(圖文
- 01-10delphi制作wav文件的方法
- 08-05DEDE織夢data目錄下的sessions文件夾有什
- 01-11ajax實現頁面的局部加載
- 01-10SublimeText編譯C開發環境設置
- 08-05織夢dedecms什么時候用欄目交叉功能?
- 01-10C#中split用法實例總結
- 01-10使用C語言求解撲克牌的順子及n個骰子
- 08-05dedecms(織夢)副欄目數量限制代碼修改
- 04-02jquery與jsp,用jquery


