天天躁日日躁狠狠躁AV麻豆-天天躁人人躁人人躁狂躁-天天澡夜夜澡人人澡-天天影视香色欲综合网-国产成人女人在线视频观看-国产成人女人视频在线观看

asp.net(c#)做一個網頁數據采集工具

通過這個軟件一兩天就完成了幾千產品數據的錄入,可見很多工作不是一味用人工去做,作為一個程序員,就是要讓很多讓那些經常做重復性的、繁瑣的工作中的人解放出來。下面只是寫了一些核心代碼,而且采集必須要和對應網站相掛鉤,作者:鄭少群

復制代碼 代碼如下:
//提取產品列表頁中產品最終頁的網頁
private void button1_Click(object sender, EventArgs e)
{
if (textBox1.Text.Trim() == "" || textBox2.Text.Trim() == "")
{
MessageBox.Show("網址和域名不能為空!", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
return;
}
try
{
string Html = inc.GetHtml("http://study.pctoday.NET.cn");
//ArrayList al = inc.GetMatchesStr(Html, "<a[^>]*?>.*?</a>");
ArrayList al = inc.GetMatchesStr(Html, @"href/s*=/s*(?:[/'/""/s](?<1>[^/""/']*)[/'/""])");//提取鏈接


" title="Replica Watches:">Replica Watches Buy Full Quality Popular Luxury Watches at Amazing Price, Your One Stop Discount Swiss Watches StoreExclusive Replica Rolex Watches, Tag Heuer Watches Replica, Cartier Watches online Sale!
StringBuilder sb = new StringBuilder();
foreach (object var in al)
{
string a = var.ToString().Replace("/"", "").Replace("'", "");
a = Regex.Replace(a, "href=", "", RegexOptions.IgnoreCase | RegexOptions.Multiline);
if (a.StartsWith("/"))
a = textBox2.Text.Trim() + a;
if (!a.StartsWith("http://"))
a = "http://" + a;
sb.Append(a + "/r/n");
}
textBox5.Text = sb.ToString();//把提取到網址輸出到一個textBox,每個鏈接占一行



MessageBox.Show("共提取" + al.Count.ToString() + "個鏈接", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);

}
catch (Exception err)
{
MessageBox.Show("提取出錯!原因:" + err.Message, "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
}

}




//把采集的產品頁面html代碼進行字符串處理,提取需要的代碼,最后保存到本地一個access數據庫中,同時提取產品圖片地址并自動現在圖片到本地images文件夾下

private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
{
//填充產品表
Database.ExecuteNonQuery("delete from Tb_Product");
DataTable dt2 = new DataTable();
OleDbConnection conn = new OleDbConnection(Database.ConnectionStrings);
OleDbDataAdapter da = new OleDbDataAdapter("select * from Tb_Product", conn);
OleDbCommandBuilder cb = new OleDbCommandBuilder(da);
da.Fill(dt2);
dt2.Rows.Clear();

BackgroundWorker worker = (BackgroundWorker)sender;//這個是做一個進度條

string[] Urls = textBox5.Text.Trim().ToLower().Replace("/r/n", ",").Split(',');
DataTable dt = new DataTable();
StringBuilder ErrorStr = new StringBuilder();
string html = "", ImageDir = AppDomain.CurrentDomain.BaseDirectory + "Images//";

//循環每次采集網址
for (int i = 0; i < Urls.Length; i++)
{
try
{
if (!worker.CancellationPending)
{
if (Urls[i] == "")
return;
html = inc.GetHtml(Urls[i]);//獲取該url的html代碼
DataRow NewRow = dt2.NewRow();

//產品名
string ProductName = html.Substring(html.IndexOf("<title>") + 7);
NewRow["ProductName"] = ProductName.Remove(ProductName.IndexOf("</title>")).Trim();

//產品編號
NewRow["ModelId"] = NewRow["ProductName"].ToString().Substring(NewRow["ProductName"].ToString().IndexOf("Model:") + 6).Trim();

//產品介紹,這些都是根據不同網站的html做相應的修改
string Introduce = html.Substring(html.IndexOf("Product Details") + 26);
Introduce = Introduce.Remove(Introduce.IndexOf("</table>") + 8).Trim()

NewRow["Introduce"] = Introduce;



" title="Replica Watches:">Replica Watches Buy Full Quality Popular Luxury Watches at Amazing Price, Your One Stop Discount Swiss Watches StoreExclusive Replica Rolex Watches, Tag Heuer Watches Replica, Cartier Watches online Sale!
//下載圖片
string ProductImage = html.Substring(html.IndexOf("align=center><img") + 17);
ProductImage = textBox2.Text.Trim() + ProductImage.Substring(ProductImage.IndexOf("src=/"") + 5);
ProductImage = ProductImage.Remove(ProductImage.IndexOf("/""));
try
{
inc.DownFile(ProductImage, ImageDir + ProductImage.Substring(ProductImage.LastIndexOf("/") + 1));
}
catch (Exception)
{
ErrorStr.Append("下載圖片失敗,圖片地址:" + ImageDir + ProductImage.Substring(ProductImage.LastIndexOf("/") + 1) + "/r/n");
}


dt2.Rows.Add(NewRow);

//Thread.Sleep(100);
worker.ReportProgress((i + 1) * 100 / Urls.Length, i);
toolStripStatusLabel1.Text = "處理進度:" + (i + 1).ToString() + "/" + Urls.Length.ToString();//進度條
}

}
catch (Exception err)
{
ErrorStr.Append("采集錯誤:" + err.Message + ";網址:" + Urls[i] + "/r/n");
}
}
da.Update(dt2);
DataBind(dt2);
ShowError(ErrorStr.ToString());
}

/// <summary>
/// ASPX頁面生成靜態Html頁面,作者:鄭少群
/// </summary>
public static string GetHtml(string url)
{
StreamReader sr = null;
string str = null;
//讀取遠程路徑
WebRequest request = WebRequest.Create(url);
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
sr = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(response.CharacterSet));
str = sr.ReadToEnd();
sr.Close();
return str;
}


// 提取HTML代碼中的網址
public static ArrayList GetMatchesStr(string htmlCode, string strRegex)
{
ArrayList al = new ArrayList();

Regex r = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);
MatchCollection m = r.Matches(htmlCode);

for (int i = 0; i < m.Count; i++)
{
bool rep = false;
string strNew = m[i].ToString();

// 過濾重復的URL
foreach (string str in al)
{
if (strNew == str)
{
rep = true;
break;
}
}

if (!rep) al.Add(strNew);
}

al.Sort();

return al;
}

public static void DownFile(string Url, string Path)
{

HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
HttpWebResponse response = (HttpWebResponse)request.GetResponse();
Stream stream = response.GetResponseStream();
long size = response.ContentLength;
//創建文件流對象
using (FileStream fs = new FileStream(Path, FileMode.OpenOrCreate, FileAccess.Write))
{
byte[] b = new byte[1025];
int n = 0;
while ((n = stream.Read(b, 0, 1024)) > 0)
{
fs.Write(b, 0, n);
}
}
}

AspNet技術asp.net(c#)做一個網頁數據采集工具,轉載需保留來源!

鄭重聲明:本文版權歸原作者所有,轉載文章僅為傳播更多信息之目的,如作者信息標記有誤,請第一時間聯系我們修改或刪除,多謝。

主站蜘蛛池模板: 国产AV视频二区在线观看 | bbw美女与zooxx | 国产成人免费片在线视频观看 | 日韩精品真人荷官无码 | 欧美性狂猛bbbbbbxxxx | 亚洲无人区码二码三码区别图 | 午夜国产一区在线观看 | 动漫女主被扒开双腿羞辱 | 久久艹伊人 | 久久99r66热这里只有精品 | 青青草原国产在线观看 | 亚洲午夜久久久无码精品网红A片 | 伊人久久精品AV一区二区 | 中文字幕视频在线观看 | 全彩无翼污之邪恶女教师 | 久久人妻少妇嫩草AV蜜桃35I | 妻子的秘密HD观看 | yw193龙物免费官网在线 | a视频在线免费观看 | 日本夜爽爽一区二区三区 | 耽肉高h喷汁呻吟 | 国产偷国产偷亚洲高清app | 干了快生了的孕妇 | 中文字幕久久久 | 97色伦在色在线播放 | 6080yy奇领电影在线看 | 中国女人内谢69XXXXXA片 | 国产成人精品免费视频大 | 亚洲手机在线人成视频 | 秋霞电影午夜伦午夜 | 亚洲大片免费观看 | BL文库好大粗黑强强肉NP | 欧美日韩黄色 | 我的家庭女教师 | yin乱教师系列合集 yin荡体育课羞耻play双性 | 毛片免费在线视频 | 视频一区二区三区蜜桃麻豆 | 好紧的小嫩嫩17p | yy4408午夜场理论片 | 午夜视频无码国产在线观看 | 搞av.com|